Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-29313

SELECT transation get's stuck temporally related to OPTIMIZATION MetaData Lock

Details

    • Bug
    • Status: Open (View Workflow)
    • Major
    • Resolution: Unresolved
    • 10.5.13
    • None
    • Locking
    • None
    • RHEL8

    Description

      We have a nightly process that optimizes specific high update rate tables. At the same time, and synchronous to that optimization process, select queries come in very frequently.

      There are cases, where a query that should run about 2-5 seconds will get stuck indefinitely. These queries come in just before, and possibly nano or milliseconds before an optimization statement, which imposes a METADATA LOCK on all the pending selects against the table, including possibly this one, or maybe not. We are not sure.

      However, once the METADATA LOCK clears, the query will be shown in running mode, but in fact it is not running. There was a bug resolved in 10.5.16 associated with a missed flagging/unflagging of certain queries during some replication based events that sounded so similar, that I thought this issue would be resolved by that update. Unfortunately, this turned out to not be the case.

      We are still seeing the selects that appear to be running, but in fact are not happening approximately once a week. I suspect that it's a temporal issue related to the flagging/unflagging of METADATA LOCKS that is causing the issue. That MariaDB unlocks the select from the users perspective, but it remains in fact locked from an internals perspective.

      Attachments

        1. global_status.txt
          15 kB
        2. global_variables.txt
          19 kB
        3. my.cnf
          8 kB

        Activity

          TheWitness Larry Adams created issue -
          serg Sergei Golubchik made changes -
          Field Original Value New Value
          Assignee Lena Startseva [ JIRAUSER50478 ]
          elenst Elena Stepanova made changes -
          Status Open [ 1 ] Needs Feedback [ 10501 ]
          TheWitness Larry Adams made changes -
          Comment [ Not posting as a file. Hope that's okay. First server.cnf. Replication is off due to statement replication bugs. Row replication was more stable, but the system could not keep up and lagged behind. Waiting on contract before re-instituting replication.

          #
          # These groups are read by MariaDB server.
          # Use it for options that only the server (but not clients) should see
          #
          # See the examples of server my.cnf files in /usr/share/mysql/
          #

          # this is only for the mysqld standalone daemon
          [mysqld]
          # NVMe Storage location
          datadir = /data/mysql
          #innodb_force_recovery=1

          # Logging for MariaDB
          log_error = /var/log/mariadb/mariadb.log
          log_warnings = 1
          socket = /data/mysql/mysql.sock
          tmpdir = /tmp
          innodb_tmpdir = /tmp

          character-set-server = utf8mb4
          collation-server = utf8mb4_unicode_ci
          max_heap_table_size = 4G
          max_allowed_packet = 128M
          tmp_table_size = 4G
          join_buffer_size = 3G
          sort_buffer_size = 400M
          max_connections = 1000
          table_cache = 2000
          table_definition_cache = 2000

          # Reduce the 2013 errors
          net_read_timeout = 0

          # slow log
          slow_query_log_file = /var/log/mariadb/mariadb-slow.log
          slow_query_log = ON
          long_query_time = 10

          # Secure file transfer location
          secure_file_priv='/local/mnt/workspace/mariadb'

          # Enable the Performance Schema
          performance_schema = ON

          # We still have some MyISAM tables for Heurisitcs
          key_buffer_size=1000000000

          # Work Around MariaDB Bug in 10.5 Series
          optimizer_switch="index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_merge_sort_intersection=off,engine_condition_pushdown=off,index_condition_pushdown=on,derived_merge=on,derived_with_keys=on,firstmatch=on,loosescan=on,materialization=on,in_to_exists=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr=off,mrr_cost_based=off,mrr_sort_keys=off,outer_join_with_cache=on,semijoin_with_cache=on,join_cache_incremental=on,join_cache_hashed=on,join_cache_bka=on,optimize_join_buffer_size=on,table_elimination=on,extended_keys=off,exists_to_in=on,orderby_uses_equalities=on,condition_pushdown_for_derived=on,split_materialized=on,condition_pushdown_for_subquery=on,rowid_filter=on,condition_pushdown_from_having=on,not_null_range_scan=off"

          # query cache
          query_cache_size=0
          query_cache_type=0

          # important for compatibility
          sql_mode=NO_ENGINE_SUBSTITUTION,NO_AUTO_CREATE_USER

          # Flush as often as possible to increase performance on high
          # update tables
          innodb_max_dirty_pages_pct_lwm = 0.0001
          innodb_max_dirty_pages_pct = 0.0001

          # Thread handling
          thread_handling = pool-of-threads
          # Testing this setting which seems to have changed after the outage on
          # Week 11. Reverting to thread-per-connection
          #thread_handling = thread-per-connection
          thread_pool_size = 112

          # innodb settings
          innodb_flush_log_at_timeout = 3
          innodb_file_per_table = ON
          innodb_stats_on_metadata = OFF

          innodb_buffer_pool_size = 1000GB
          innodb_sort_buffer_size = 67108864

          innodb_log_file_size = 100G
          innodb_flush_log_at_trx_commit = 2
          innodb_flush_log_at_timeout = 3
          innodb_lock_wait_timeout = 50
          innodb_open_files = 8000

          # Recent Tuning exercise
          innodb_fast_shutdown = 1
          innodb_change_buffering = none
          innodb_flush_sync = 1
          innodb_lru_flush_size = 16384
          innodb_lru_scan_depth = 32768
          innodb_flushing_avg_loops = 50
          innodb_purge_threads = 10
          innodb_log_buffer_size = 64M

          # for SSD's
          innodb_doublewrite = 0
          innodb_use_atomic_writes = 1
          innodb_use_native_aio = 1
          innodb_read_io_threads = 64
          innodb_write_io_threads = 64
          innodb_io_capacity = 35000
          innodb_io_capacity_max = 65000
          innodb_flush_method = O_DIRECT

          # Read Uncommitted, and new innodb log variable
          #transaction_isolation = REPEATABLE-READ
          transaction_isolation = READ-UNCOMMITTED
          innodb_online_alter_log_max_size = 10000000000

          # Misc tuning
          thread_pool_size = 256
          thread_cache_size = 128
          innodb_lock_wait_timeout = 60
          table_open_cache = 12000
          table_open_cache_instances = 16

          # Replication setup
          #server-id = 02
          #gtid-domain-id = 1
          #bind-address = 10.23.248.25
          #log-bin = /data/mysql-log/rtm-db
          #log-bin-index = /data/mysql-log/rtm-db.index
          #relay-log = /data/mysql-relay/rtm-db
          #relay-log-index = /data/mysql-relay/rtm-db.index
          #binlog_format = row
          #sync_binlog = 1
          #slave_skip_errors = 1032,1062,1956,1927,1020,1205,1964

          # Ignore some less useful table for replication
          #binlog_do_db = "cacti"
          #replicate_ignore_table = "cacti.grid_processes,cacti.processes,cacti.poller_output,cacti.poller_output_boost"
          #replicate_wild_ignore_table = "cacti.poller_output%"

          # Semi-syncronous replication setup
          #rpl_semi_sync_master_enabled = ON
          #rpl_semi_sync_master_timeout = 20000
          #rpl_semi_sync_master_wait_point = AFTER_SYNC

          # Parallel Replication
          #slave_parallel_mode = optimistic
          #slave_parallel_threads = 60
          #slave_parallel_max_queued = 2147483647{quote} ]
          TheWitness Larry Adams made changes -
          Attachment my.cnf [ 65616 ]
          TheWitness Larry Adams made changes -
          Attachment global_status.txt [ 65617 ]
          TheWitness Larry Adams made changes -
          Attachment global_variables.txt [ 65618 ]
          serg Sergei Golubchik made changes -
          Status Needs Feedback [ 10501 ] Open [ 1 ]

          People

            lstartseva Lena Startseva
            TheWitness Larry Adams
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated:

              Git Integration

                Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.