Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-24021

Poor optimization for SELECT DISTINCT ... ORDER BY LIMIT 1

    XMLWordPrintable

Details

    Description

      Consider an example:

      create table i (
        id int,
        sku int,
        col1 int,
        key(id)
      );
      insert into i select seq, seq, seq from seq_1_to_10000;
      

      create table s (
        sku int,
        col1 int,
        key(sku)
      );
      insert into s select seq, seq from seq_1_to_20000;
      

      explain
      SELECT 
        DISTINCT i.col1 
      FROM i JOIN s 
      WHERE 
        i.sku = s.sku 
      ORDER BY 
        i.id DESC LIMIT 1;
      

      The query plan for this is to use the index over (i.id) to satisfy the ORDER BY ... LIMIT (and shortcut the execution early):

      +------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+
      | id   | select_type | table | type  | possible_keys | key  | key_len | ref      | rows | Extra                        |
      +------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+
      |    1 | SIMPLE      | i     | index | NULL          | id   | 5       | NULL     | 1    | Using where; Using temporary |
      |    1 | SIMPLE      | s     | ref   | sku           | sku  | 5       | j5.i.sku | 1    | Using index; Distinct        |
      +------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+
      

      The problem is, handling DISTINCT actually prevents the short-cutting.

      MariaDB's duplicate-removal algorithm enumerates the whole join output.

      analyze format=json
      SELECT DISTINCT i.col1 FROM i JOIN s WHERE i.sku = s.sku 
      ORDER BY i.id DESC LIMIT 1;
      

      ANALYZE FORMAT=JSON shows:
      {
        "query_block": {
          "select_id": 1,
          "r_loops": 1,
          "r_total_time_ms": 650.5171852,
          "temporary_table": {
            "table": {
              "table_name": "i",
              "access_type": "index",
              "key": "id",
              "key_length": "5",
              "used_key_parts": ["id"],
              "r_loops": 1,
              "rows": 10266,
              "r_rows": 10000,
              "r_table_time_ms": 399.2319244,
              "r_other_time_ms": 10.25677143,
              "filtered": 100,
              "r_filtered": 100,
              "attached_condition": "i.sku is not null"
            },
            "table": {
              "table_name": "s",
              "access_type": "ref",
              "possible_keys": ["sku"],
              "key": "sku",
              "key_length": "5",
              "used_key_parts": ["sku"],
              "ref": ["j5.i.sku"],
              "r_loops": 10000,
              "rows": 1,
              "r_rows": 1,
              "r_table_time_ms": 227.2785918,
              "r_other_time_ms": 13.72163759,
              "filtered": 100,
              "r_filtered": 100,
              "using_index": true,
              "distinct": true
            }
          }
        }
      }
      

      Note the "r_rows": 10000. It has computed the whole join output.

      Let's run the same query without DISTINCT:

      analyze format=json
      SELECT i.col1 FROM i JOIN s WHERE i.sku = s.sku 
      ORDER BY i.id DESC LIMIT 1;
      

      {
        "query_block": {
          "select_id": 1,
          "r_loops": 1,
          "r_total_time_ms": 0.626986227,
          "table": {
            "table_name": "i",
            "access_type": "index",
            "key": "id",
            "key_length": "5",
            "used_key_parts": ["id"],
            "r_loops": 1,
            "rows": 10266,
            "r_rows": 1,
            "r_table_time_ms": 0.314612616,
            "r_other_time_ms": 0.023000019,
            "filtered": 100,
            "r_filtered": 100,
            "attached_condition": "i.sku is not null"
          },
          "table": {
            "table_name": "s",
            "access_type": "ref",
            "possible_keys": ["sku"],
            "key": "sku",
            "key_length": "5",
            "used_key_parts": ["sku"],
            "ref": ["j5.i.sku"],
            "r_loops": 1,
            "rows": 1,
            "r_rows": 1,
            "r_table_time_ms": 0.136636567,
            "r_other_time_ms": 0.125355838,
            "filtered": 100,
            "r_filtered": 100,
            "using_index": true
          }
        }
      }
      

      Attachments

        Activity

          People

            psergei Sergei Petrunia
            psergei Sergei Petrunia
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated:

              Git Integration

                Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.