Details
-
Bug
-
Status: Closed (View Workflow)
-
Major
-
Resolution: Not a Bug
-
11.4.1
-
None
Description
The condition WHERE canonical = 45 does not appear to be pushed down into the hist_data CTE.
Here is the test case that I used.
CREATE OR REPLACE TABLE replay_results( |
event_id BIGINT UNSIGNED NOT NULL PRIMARY KEY, |
canonical INT UNSIGNED NOT NULL, |
duration DOUBLE NOT NULL, |
start_time DOUBLE NOT NULL, |
result_rows INT UNSIGNED NOT NULL, |
rows_read INT UNSIGNED NOT NULL, |
error SMALLINT UNSIGNED DEFAULT(0) |
-- This doesn't seem to help at all
|
-- , KEY canonical_key (canonical)
|
) ENGINE=MyISAM;
|
 |
CREATE OR REPLACE TABLE canonicals( |
canonical INT UNSIGNED NOT NULL PRIMARY KEY, |
canonical_sql MEDIUMTEXT NOT NULL |
) ENGINE=MyISAM;
|
--
|
-- Load the replay
|
LOAD DATA LOCAL INFILE 'replay.csv' INTO TABLE replay_results |
FIELDS TERMINATED BY ',' |
LINES TERMINATED BY '\n' |
IGNORE 1 LINES |
(event_id, canonical, duration, start_time, result_rows, rows_read, @err)
|
SET error = 0; |
 |
--
|
-- Load the canonicals
|
--
|
LOAD DATA LOCAL INFILE 'canonicals.csv' INTO TABLE canonicals |
FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' |
LINES TERMINATED BY '\n' |
IGNORE 1 LINES; |
--
|
-- This version has everything in a single query, takes about 16 seconds on my
|
-- laptop. Adding a 'WHERE canonical = 45' to the final SELECT from
|
-- json_histogram does improve the speed but it's still slower than the
|
-- temporary table version would be that's after this one.
|
--
|
ANALYZE FORMAT=JSON WITH can_summary AS ( |
SELECT |
canonical,
|
MIN(duration) dur_min, MAX(duration) dur_max, SUM(duration) dur_sum, AVG(duration) dur_avg, STDDEV(duration) dur_stddev, |
MIN(rows_read) rr_min, MAX(rows_read) rr_max, SUM(rows_read) rr_sum, AVG(rows_read) rr_avg, STDDEV(rows_read) rr_stddev |
FROM replay_results GROUP BY canonical |
), can_summary_sql AS( |
SELECT canonical_sql, can_summary.* FROM can_summary JOIN canonicals ON (canonicals.canonical = can_summary.canonical) |
), hist_limits AS ( |
SELECT canonical, |
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical |
), hist_data AS ( |
SELECT a.canonical, |
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt |
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2 |
), hist_empty_range AS ( |
SELECT canonical, |
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2 |
), json_histogram AS( |
SELECT a.canonical, JSON_OBJECT( |
'id', a.canonical, |
'sql', cs.canonical_sql, |
'dur_min', cs.dur_min, |
'dur_max', cs.dur_max, |
'dur_sum', cs.dur_sum, |
'dur_avg', cs.dur_avg, |
'dur_stddev', cs.dur_stddev, |
'rr_min', cs.rr_min, |
'rr_max', cs.rr_max, |
'rr_sum', cs.rr_sum, |
'rr_avg', cs.rr_avg, |
'rr_stddev', cs.rr_stddev, |
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number), |
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number) |
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number) |
JOIN can_summary_sql cs ON (a.canonical = cs.canonical) GROUP BY 1 |
) SELECT JSON_ARRAYAGG(js) FROM json_histogram \G |
--
|
-- Creating a temporary table out of can_summary speeds things up by a lot
|
--
|
CREATE OR REPLACE TEMPORARY TABLE can_summary AS WITH can_summary AS ( |
SELECT |
canonical,
|
MIN(duration) dur_min, MAX(duration) dur_max, SUM(duration) dur_sum, AVG(duration) dur_avg, STDDEV(duration) dur_stddev, |
MIN(rows_read) rr_min, MAX(rows_read) rr_max, SUM(rows_read) rr_sum, AVG(rows_read) rr_avg, STDDEV(rows_read) rr_stddev |
FROM replay_results GROUP BY canonical |
), can_summary_sql AS( |
SELECT canonical_sql, can_summary.* FROM can_summary JOIN canonicals ON (canonicals.canonical = can_summary.canonical) |
) SELECT * FROM can_summary_sql; |
ANALYZE FORMAT=JSON WITH hist_limits AS ( |
SELECT canonical, |
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical |
), hist_data AS ( |
SELECT a.canonical, |
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt |
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2 |
), hist_empty_range AS ( |
SELECT canonical, |
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2 |
), json_histogram AS( |
SELECT a.canonical, JSON_OBJECT( |
'id', a.canonical, |
'sql', cs.canonical_sql, |
'dur_min', cs.dur_min, |
'dur_max', cs.dur_max, |
'dur_sum', cs.dur_sum, |
'dur_avg', cs.dur_avg, |
'dur_stddev', cs.dur_stddev, |
'rr_min', cs.rr_min, |
'rr_max', cs.rr_max, |
'rr_sum', cs.rr_sum, |
'rr_avg', cs.rr_avg, |
'rr_stddev', cs.rr_stddev, |
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number), |
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number) |
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number) |
JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1 |
) SELECT JSON_ARRAYAGG(js) FROM json_histogram \G |
--
|
-- Biggest group is canonical 11: 2169945 values
|
-- Smallest group is canonical 45: 1 value
|
--
|
SELECT canonical, COUNT(*) FROM replay_results GROUP BY canonical ORDER by 2 DESC LIMIT 1; |
SELECT canonical, COUNT(*) FROM replay_results GROUP BY canonical ORDER by 2 ASC LIMIT 1; |
 |
--
|
-- Filtering to a single group does not seem to speed things up
|
--
|
ANALYZE FORMAT=JSON WITH hist_limits AS ( |
SELECT canonical, |
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical |
), hist_data AS ( |
SELECT a.canonical, |
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt |
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2 |
), hist_empty_range AS ( |
SELECT canonical, |
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2 |
), json_histogram AS( |
SELECT a.canonical, JSON_OBJECT( |
'id', a.canonical, |
'sql', cs.canonical_sql, |
'dur_min', cs.dur_min, |
'dur_max', cs.dur_max, |
'dur_sum', cs.dur_sum, |
'dur_avg', cs.dur_avg, |
'dur_stddev', cs.dur_stddev, |
'rr_min', cs.rr_min, |
'rr_max', cs.rr_max, |
'rr_sum', cs.rr_sum, |
'rr_avg', cs.rr_avg, |
'rr_stddev', cs.rr_stddev, |
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number), |
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number) |
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number) |
JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1 |
) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G |
--
|
-- Adding the filtering condition to hist_data makes it fast
|
--
|
ANALYZE FORMAT=JSON WITH hist_limits AS ( |
SELECT canonical, |
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical |
), hist_data AS ( |
SELECT a.canonical, |
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt |
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max AND a.canonical = 45 GROUP BY 1, 2 |
), hist_empty_range AS ( |
SELECT canonical, |
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2 |
), json_histogram AS( |
SELECT a.canonical, JSON_OBJECT( |
'id', a.canonical, |
'sql', cs.canonical_sql, |
'dur_min', cs.dur_min, |
'dur_max', cs.dur_max, |
'dur_sum', cs.dur_sum, |
'dur_avg', cs.dur_avg, |
'dur_stddev', cs.dur_stddev, |
'rr_min', cs.rr_min, |
'rr_max', cs.rr_max, |
'rr_sum', cs.rr_sum, |
'rr_avg', cs.rr_avg, |
'rr_stddev', cs.rr_stddev, |
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number), |
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number) |
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number) |
JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1 |
) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G |
--
|
-- Creating an index doesn't seem to improve the situation
|
--
|
CREATE INDEX canonical_idx ON replay_results(canonical);
|
 |
ANALYZE FORMAT=JSON WITH hist_limits AS (
|
SELECT canonical,
|
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical
|
), hist_data AS (
|
SELECT a.canonical,
|
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt
|
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
|
), hist_empty_range AS (
|
SELECT canonical,
|
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
|
), json_histogram AS(
|
SELECT a.canonical, JSON_OBJECT(
|
'id', a.canonical,
|
'sql', cs.canonical_sql,
|
'dur_min', cs.dur_min,
|
'dur_max', cs.dur_max,
|
'dur_sum', cs.dur_sum,
|
'dur_avg', cs.dur_avg,
|
'dur_stddev', cs.dur_stddev,
|
'rr_min', cs.rr_min,
|
'rr_max', cs.rr_max,
|
'rr_sum', cs.rr_sum,
|
'rr_avg', cs.rr_avg,
|
'rr_stddev', cs.rr_stddev,
|
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
|
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
|
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
|
JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
|
) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
|
 |
ALTER TABLE replay_results ENGINE=InnoDB;
|
ALTER TABLE canonicals ENGINE=InnoDB;
|
 |
--
|
-- InnoDB is several orders of magnitude slower than MyISAM or Aria
|
--
|
ANALYZE FORMAT=JSON WITH hist_limits AS (
|
SELECT canonical,
|
dur_avg + dur_stddev * 3 val_max,
|
dur_min val_min,
|
(dur_avg + dur_stddev * 3 - dur_min) / 100 bin_width
|
FROM can_summary GROUP BY canonical
|
), hist_data AS (
|
SELECT a.canonical,
|
FLOOR((duration - b.val_min) / b.bin_width) bin_number,
|
b.val_min + FLOOR((duration - b.val_min) / b.bin_width) * b.bin_width bin_value,
|
COUNT(duration) cnt
|
FROM replay_results a JOIN hist_limits b ON (a.canonical = b.canonical) WHERE a.duration < b.val_max GROUP BY 1, 2
|
), hist_empty_range AS (
|
SELECT canonical,
|
seq bin_number,
|
val_min + seq * bin_width bin_value
|
FROM hist_limits JOIN seq_0_to_100 GROUP BY 1, 2
|
), json_histogram AS(
|
SELECT a.canonical, JSON_OBJECT(
|
'id', a.canonical,
|
'sql', cs.canonical_sql,
|
'dur_min', cs.dur_min,
|
'dur_max', cs.dur_max,
|
'dur_sum', cs.dur_sum,
|
'dur_avg', cs.dur_avg,
|
'dur_stddev', cs.dur_stddev,
|
'rr_min', cs.rr_min,
|
'rr_max', cs.rr_max,
|
'rr_sum', cs.rr_sum,
|
'rr_avg', cs.rr_avg,
|
'rr_stddev', cs.rr_stddev,
|
'bins', JSON_ARRAYAGG(a.bin_value ORDER BY a.bin_number),
|
'counts', JSON_ARRAYAGG(COALESCE(b.cnt, 0) ORDER BY a.bin_number)
|
) js
|
FROM hist_empty_range a LEFT JOIN hist_data b ON (a.canonical = b.canonical AND a.bin_number = b.bin_number)
|
JOIN can_summary cs ON (a.canonical = cs.canonical) GROUP BY 1
|
) SELECT JSON_ARRAYAGG(js) FROM json_histogram WHERE canonical = 45 \G
|
I have attached the execution of this SQL with the original data set I have and added a few megabytes worth of the data from replay.csv. The original replay.csv is about 482MiB and I can upload it if needed.
Attachments
Issue Links
- causes
-
MDEV-33964 Consider Implementing Condition Pushdown to Inner Tables of Outer Joins
- Open