Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-33871

Condition is not pushed down into CTE

    XMLWordPrintable

Details

    • Bug
    • Status: Closed (View Workflow)
    • Major
    • Resolution: Not a Bug
    • 11.4.1
    • N/A
    • Optimizer
    • None

    Description

      The condition WHERE canonical = 45 does not appear to be pushed down into the hist_data CTE.

      Here is the test case that I used.

      CREATE OR REPLACE TABLE replay_results(
             event_id BIGINT UNSIGNED NOT NULL PRIMARY KEY,
             canonical INT UNSIGNED NOT NULL,
             duration DOUBLE NOT NULL,
             start_time DOUBLE NOT NULL,
             result_rows INT UNSIGNED NOT NULL,
             rows_read INT UNSIGNED NOT NULL,
             error SMALLINT UNSIGNED DEFAULT(0)
      -- This doesn't seem to help at all       
      --       , KEY canonical_key (canonical)
      ) ENGINE=MyISAM;
       
      CREATE OR REPLACE TABLE canonicals(
             canonical INT UNSIGNED NOT NULL PRIMARY KEY,
             canonical_sql MEDIUMTEXT NOT NULL
      ) ENGINE=MyISAM;
      

      --
      -- Load the replay
      LOAD DATA LOCAL INFILE 'replay.csv' INTO TABLE replay_results
      FIELDS TERMINATED BY ','
      LINES TERMINATED BY '\n'
      IGNORE 1 LINES
      (event_id, canonical, duration, start_time, result_rows, rows_read, @err)
      SET error = 0;
       
      --
      -- Load the canonicals
      --
      LOAD DATA LOCAL INFILE 'canonicals.csv' INTO TABLE canonicals
      FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
      LINES TERMINATED BY '\n'
      IGNORE 1 LINES;
      

      --
      -- This version has everything in a single query, takes about 16 seconds on my
      -- laptop. Adding a 'WHERE canonical = 45' to the final SELECT from
      -- json_histogram does improve the speed but it's still slower than the
      -- temporary table version would be that's after this one.
      --
      ANALYZE FORMAT=JSON WITH can_summary AS (
        SELECT
          canonical,
          MIN(duration) dur_min, MAX(duration) dur_max, SUM(duration) dur_sum, AVG(duration) dur_avg, STDDEV(duration) dur_stddev,
          MIN(rows_read) rr_min, MAX(rows_read) rr_max, SUM(rows_read) rr_sum, AVG(rows_read) rr_avg, STDDEV(rows_read) rr_stddev
        FROM replay_results GROUP BY canonical
      ), can_summary_sql AS(
      SELECT canonical_sql, can_summary.* FROM can_summary JOIN canonicals ON (canonicals.canonical = can_summary.canonical)
      ), hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary_sql cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram \G
      

      --
      -- Creating a temporary table out of can_summary speeds things up by a lot
      --
      CREATE OR REPLACE TEMPORARY TABLE can_summary AS WITH can_summary AS (
        SELECT
          canonical,
          MIN(duration) dur_min, MAX(duration) dur_max, SUM(duration) dur_sum, AVG(duration) dur_avg, STDDEV(duration) dur_stddev,
          MIN(rows_read) rr_min, MAX(rows_read) rr_max, SUM(rows_read) rr_sum, AVG(rows_read) rr_avg, STDDEV(rows_read) rr_stddev
        FROM replay_results GROUP BY canonical
      ), can_summary_sql AS(
      SELECT canonical_sql, can_summary.* FROM can_summary JOIN canonicals ON (canonicals.canonical = can_summary.canonical)
      ) SELECT * FROM can_summary_sql;
      

      ANALYZE FORMAT=JSON WITH hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram \G
      

      --
      -- Biggest group is canonical 11: 2169945 values
      -- Smallest group is canonical 45: 1 value
      --
      SELECT canonical, COUNT(*) FROM replay_results GROUP BY canonical ORDER by 2 DESC LIMIT 1;
      SELECT canonical, COUNT(*) FROM replay_results GROUP BY canonical ORDER by 2 ASC LIMIT 1;
       
      --
      -- Filtering to a single group does not seem to speed things up
      --
      ANALYZE FORMAT=JSON WITH hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
      

      --
      -- Adding the filtering condition to hist_data makes it fast
      --
      ANALYZE FORMAT=JSON WITH hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max AND a.canonical = 45 GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
      

      --
      -- Creating an index doesn't seem to improve the situation
      --
      CREATE INDEX canonical_idx ON replay_results(canonical);
       
      ANALYZE FORMAT=JSON WITH hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
       
      ALTER TABLE replay_results ENGINE=InnoDB;
      ALTER TABLE canonicals ENGINE=InnoDB;
       
      --
      -- InnoDB is several orders of magnitude slower than MyISAM or Aria
      --
      ANALYZE FORMAT=JSON WITH hist_limits AS (
        SELECT canonical,
               dur_avg + dur_stddev * 3 val_max,
               dur_min val_min,
               (dur_avg + dur_stddev * 3 -  dur_min) / 100 bin_width
        FROM can_summary GROUP BY canonical
      ), hist_data AS (
        SELECT a.canonical,
             FLOOR((duration - b.val_min) / b.bin_width) bin_number,
             b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
             COUNT(duration) cnt
        FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
      ), hist_empty_range AS (
        SELECT canonical,
               seq bin_number,
               val_min + seq * bin_width bin_value
        FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
      ), json_histogram AS(
      SELECT a.canonical, JSON_OBJECT(
        'id', a.canonical,
        'sql', cs.canonical_sql,
        'dur_min', cs.dur_min,
        'dur_max', cs.dur_max,
        'dur_sum', cs.dur_sum,
        'dur_avg', cs.dur_avg,
        'dur_stddev', cs.dur_stddev,
        'rr_min', cs.rr_min,
        'rr_max', cs.rr_max,
        'rr_sum', cs.rr_sum,
        'rr_avg', cs.rr_avg,
        'rr_stddev', cs.rr_stddev,
        'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
        'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
      ) js
      FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
      JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
      ) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
      

      I have attached the execution of this SQL with the original data set I have and added a few megabytes worth of the data from replay.csv. The original replay.csv is about 482MiB and I can upload it if needed.

      Attachments

        1. canonicals.csv
          4 kB
        2. output.txt
          102 kB
        3. replay.csv
          3.26 MB

        Issue Links

          Activity

            People

              Gosselin Dave Gosselin
              markus makela markus makela
              Votes:
              1 Vote for this issue
              Watchers:
              6 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.