[MDEV-24021] Poor optimization for SELECT DISTINCT ... ORDER BY LIMIT 1 - Jira

XML

Word

Printable

Consider an example:

create table i (

  id int,

  sku int,

  col1 int,

  key(id)

);

insert into i select seq, seq, seq from seq_1_to_10000;

create table s (

  sku int,

  col1 int,

  key(sku)

);

insert into s select seq, seq from seq_1_to_20000;

explain

SELECT

  DISTINCT i.col1

FROM i JOIN s

WHERE

  i.sku = s.sku

ORDER BY

  i.id DESC LIMIT 1;

The query plan for this is to use the index over (i.id) to satisfy the ORDER BY ... LIMIT (and shortcut the execution early):

+------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+

| id   | select_type | table | type  | possible_keys | key  | key_len | ref      | rows | Extra                        |

+------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+

|    1 | SIMPLE      | i     | index | NULL          | id   | 5       | NULL     | 1    | Using where; Using temporary |

|    1 | SIMPLE      | s     | ref   | sku           | sku  | 5       | j5.i.sku | 1    | Using index; Distinct        |

+------+-------------+-------+-------+---------------+------+---------+----------+------+------------------------------+

The problem is, handling DISTINCT actually prevents the short-cutting.

MariaDB's duplicate-removal algorithm enumerates the whole join output.

analyze format=json

SELECT DISTINCT i.col1 FROM i JOIN s WHERE i.sku = s.sku

ORDER BY i.id DESC LIMIT 1;

ANALYZE FORMAT=JSON shows:

  "query_block": {

    "select_id": 1,

    "r_loops": 1,

    "r_total_time_ms": 650.5171852,

    "temporary_table": {

      "table": {

        "table_name": "i",

        "access_type": "index",

        "key": "id",

        "key_length": "5",

        "used_key_parts": ["id"],

        "r_loops": 1,

        "rows": 10266,

        "r_rows": 10000,

        "r_table_time_ms": 399.2319244,

        "r_other_time_ms": 10.25677143,

        "filtered": 100,

        "r_filtered": 100,

        "attached_condition": "i.sku is not null"

},

      "table": {

        "table_name": "s",

        "access_type": "ref",

        "possible_keys": ["sku"],

        "key": "sku",

        "key_length": "5",

        "used_key_parts": ["sku"],

        "ref": ["j5.i.sku"],

        "r_loops": 10000,

        "rows": 1,

        "r_rows": 1,

        "r_table_time_ms": 227.2785918,

        "r_other_time_ms": 13.72163759,

        "filtered": 100,

        "r_filtered": 100,

        "using_index": true,

        "distinct": true

Note the "r_rows": 10000. It has computed the whole join output.

Let's run the same query without DISTINCT:

analyze format=json

SELECT i.col1 FROM i JOIN s WHERE i.sku = s.sku

ORDER BY i.id DESC LIMIT 1;

  "query_block": {

    "select_id": 1,

    "r_loops": 1,

    "r_total_time_ms": 0.626986227,

    "table": {

      "table_name": "i",

      "access_type": "index",

      "key": "id",

      "key_length": "5",

      "used_key_parts": ["id"],

      "r_loops": 1,

      "rows": 10266,

      "r_rows": 1,

      "r_table_time_ms": 0.314612616,

      "r_other_time_ms": 0.023000019,

      "filtered": 100,

      "r_filtered": 100,

      "attached_condition": "i.sku is not null"

},

    "table": {

      "table_name": "s",

      "access_type": "ref",

      "possible_keys": ["sku"],

      "key": "sku",

      "key_length": "5",

      "used_key_parts": ["sku"],

      "ref": ["j5.i.sku"],

      "r_loops": 1,

      "rows": 1,

      "r_rows": 1,

      "r_table_time_ms": 0.136636567,

      "r_other_time_ms": 0.125355838,

      "filtered": 100,

      "r_filtered": 100,

      "using_index": true

Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.