--echo # MDEV-5059: Wrong result (missing row) wih semijoin, join_cache_level > 2, LEFT JOIN, ORDER BY
--echo #
set join_buffer_size= 1024*1024*2;
SET join_cache_level = 3;
set optimizer_search_depth=62;
set join_buffer_space_limit=2097152;
set optimizer_switch="index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_merge_sort_intersection=off,engine_condition_pushdown=off,index_condition_pushdown=on,derived_merge=on,derived_with_keys=on,firstmatch=on,loosescan=on,materialization=on,in_to_exists=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr=off,mrr_cost_based=off,mrr_sort_keys=off,outer_join_with_cache=on,semijoin_with_cache=on,join_cache_incremental=on,join_cache_hashed=on,join_cache_bka=on,optimize_join_buffer_size=off,table_elimination=on,extended_keys=on,exists_to_in=on,orderby_uses_equalities=on,condition_pushdown_for_derived=on,split_materialized=on";
CREATE TABLE t1 (pk1 INT PRIMARY KEY, i1 INT) ENGINE=MyISAM;
OK the plans look similar but they are not exactly the same, lets analyze the output of ANALYZE FORMAT=JSON for the queries with different join_buffer_size.
When we have a smaller join_buffer_size in that case we disallow using join buffering for a table (t3 here)
Output of ANALYZE with join_buffer_size= 2097152
ANALYZE FORMAT=JSON
SELECT * FROM t1, t2 LEFT JOIN t3 ON (c3 = c2)
WHERE ( i2, pk1 ) IN ( SELECT i3, i4 FROM t3 A, t4 B);
ANALYZE
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 0.4434,
"table": {
"table_name": "B",
"access_type": "system",
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t1",
"access_type": "const",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["pk1"],
"ref": ["const"],
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t2",
"access_type": "ALL",
"r_loops": 1,
"rows": 2,
"r_rows": 2,
"r_total_time_ms": 0.0201,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "t2.i2 is not null"
},
"duplicates_removal": {
"block-nl-join": {
"table": {
"table_name": "A",
"access_type": "hash_index",
"possible_keys": ["i3"],
"key": "#hash#i3:i3",
"key_length": "5:5",
"used_key_parts": ["i3"],
"ref": ["test.t2.i2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.055,
"filtered": 66.667,
"r_filtered": 100,
"using_index": true
},
"buffer_type": "flat",
"buffer_size": "2048Kb",
"join_type": "BNLH",
"r_filtered": 100
}
},
"table": {
"table_name": "t3",
"access_type": "hash_ALL",
"possible_keys": ["c3"],
"key": "#hash#c3",
"key_length": "2",
"used_key_parts": ["c3"],
"ref": ["test.t2.c2"],
"r_loops": 2,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0289,
"filtered": 33.333,
"r_filtered": 100,
"attached_condition": "trigcond(trigcond(t2.c2 is not null))"
}
}
}
Output for ANALYZE FORMAT=JSON with join_buffer_size= 262144 (gives correct result)
ANALYZE FORMAT=JSON
SELECT * FROM t1, t2 LEFT JOIN t3 ON (c3 = c2)
WHERE ( i2, pk1 ) IN ( SELECT i3, i4 FROM t3 A, t4 B);
ANALYZE
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 0.3191,
"table": {
"table_name": "B",
"access_type": "system",
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t1",
"access_type": "const",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["pk1"],
"ref": ["const"],
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t2",
"access_type": "ALL",
"r_loops": 1,
"rows": 2,
"r_rows": 2,
"r_total_time_ms": 0.0172,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "t2.i2 is not null"
},
"duplicates_removal": {
"block-nl-join": {
"table": {
"table_name": "A",
"access_type": "hash_index",
"possible_keys": ["i3"],
"key": "#hash#i3:i3",
"key_length": "5:5",
"used_key_parts": ["i3"],
"ref": ["test.t2.i2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0422,
"filtered": 66.667,
"r_filtered": 100,
"using_index": true
},
"buffer_type": "flat",
"buffer_size": "256Kb",
"join_type": "BNLH",
"r_filtered": 100
}
},
"block-nl-join": {
"table": {
"table_name": "t3",
"access_type": "hash_ALL",
"possible_keys": ["c3"],
"key": "#hash#c3",
"key_length": "2",
"used_key_parts": ["c3"],
"ref": ["test.t2.c2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0132,
"filtered": 33.333,
"r_filtered": 100
},
"buffer_type": "flat",
"buffer_size": "256Kb",
"join_type": "BNLH",
"attached_condition": "trigcond(trigcond(t2.c2 is not null))",
"r_filtered": null
}
}
}
The problem with the first output (with smaller join buffer), it is still using hash key for table t3 which is incorrect. I think instead of using hash-key we could use ref access on key (c3) here instead. Also by not being able to do the lookup the ON condition is not evaluated (c3= c2) and hence all rows pass the where clause and we return more results.
Varun Gupta (Inactive)
added a comment - - edited OK the plans look similar but they are not exactly the same, lets analyze the output of ANALYZE FORMAT=JSON for the queries with different join_buffer_size.
When we have a smaller join_buffer_size in that case we disallow using join buffering for a table (t3 here)
Output of ANALYZE with join_buffer_size= 2097152
ANALYZE FORMAT=JSON
SELECT * FROM t1, t2 LEFT JOIN t3 ON (c3 = c2)
WHERE ( i2, pk1 ) IN ( SELECT i3, i4 FROM t3 A, t4 B);
ANALYZE
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 0.4434,
"table": {
"table_name": "B",
"access_type": "system",
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t1",
"access_type": "const",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["pk1"],
"ref": ["const"],
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t2",
"access_type": "ALL",
"r_loops": 1,
"rows": 2,
"r_rows": 2,
"r_total_time_ms": 0.0201,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "t2.i2 is not null"
},
"duplicates_removal": {
"block-nl-join": {
"table": {
"table_name": "A",
"access_type": "hash_index",
"possible_keys": ["i3"],
"key": "#hash#i3:i3",
"key_length": "5:5",
"used_key_parts": ["i3"],
"ref": ["test.t2.i2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.055,
"filtered": 66.667,
"r_filtered": 100,
"using_index": true
},
"buffer_type": "flat",
"buffer_size": "2048Kb",
"join_type": "BNLH",
"r_filtered": 100
}
},
"table": {
"table_name": "t3",
"access_type": "hash_ALL",
"possible_keys": ["c3"],
"key": "#hash#c3",
"key_length": "2",
"used_key_parts": ["c3"],
"ref": ["test.t2.c2"],
"r_loops": 2,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0289,
"filtered": 33.333,
"r_filtered": 100,
"attached_condition": "trigcond(trigcond(t2.c2 is not null))"
}
}
}
Output for ANALYZE FORMAT=JSON with join_buffer_size= 262144 (gives correct result)
ANALYZE FORMAT=JSON
SELECT * FROM t1, t2 LEFT JOIN t3 ON (c3 = c2)
WHERE ( i2, pk1 ) IN ( SELECT i3, i4 FROM t3 A, t4 B);
ANALYZE
{
"query_block": {
"select_id": 1,
"r_loops": 1,
"r_total_time_ms": 0.3191,
"table": {
"table_name": "B",
"access_type": "system",
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t1",
"access_type": "const",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["pk1"],
"ref": ["const"],
"r_loops": 0,
"rows": 1,
"r_rows": null,
"filtered": 100,
"r_filtered": null
},
"table": {
"table_name": "t2",
"access_type": "ALL",
"r_loops": 1,
"rows": 2,
"r_rows": 2,
"r_total_time_ms": 0.0172,
"filtered": 100,
"r_filtered": 100,
"attached_condition": "t2.i2 is not null"
},
"duplicates_removal": {
"block-nl-join": {
"table": {
"table_name": "A",
"access_type": "hash_index",
"possible_keys": ["i3"],
"key": "#hash#i3:i3",
"key_length": "5:5",
"used_key_parts": ["i3"],
"ref": ["test.t2.i2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0422,
"filtered": 66.667,
"r_filtered": 100,
"using_index": true
},
"buffer_type": "flat",
"buffer_size": "256Kb",
"join_type": "BNLH",
"r_filtered": 100
}
},
"block-nl-join": {
"table": {
"table_name": "t3",
"access_type": "hash_ALL",
"possible_keys": ["c3"],
"key": "#hash#c3",
"key_length": "2",
"used_key_parts": ["c3"],
"ref": ["test.t2.c2"],
"r_loops": 1,
"rows": 3,
"r_rows": 3,
"r_total_time_ms": 0.0132,
"filtered": 33.333,
"r_filtered": 100
},
"buffer_type": "flat",
"buffer_size": "256Kb",
"join_type": "BNLH",
"attached_condition": "trigcond(trigcond(t2.c2 is not null))",
"r_filtered": null
}
}
}
The problem with the first output (with smaller join buffer), it is still using hash key for table t3 which is incorrect. I think instead of using hash-key we could use ref access on key (c3) here instead. Also by not being able to do the lookup the ON condition is not evaluated (c3= c2) and hence all rows pass the where clause and we return more results.
(also, do not forget to replace an assert with assignment, as discussed on Slack)
Sergei Petrunia
added a comment - Review: https://lists.launchpad.net/maria-developers/msg12491.html
(also, do not forget to replace an assert with assignment, as discussed on Slack)
If run the query with semijoin=off, I get two rows. I assume two rows is the correct output.
The EXPLAIN is the same with both join_buffer_size settings:
+------+-------------+-------+------------+---------------+-------------+---------+----------+------+----------------------------------------------------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+------+-------------+-------+------------+---------------+-------------+---------+----------+------+----------------------------------------------------------------------------------+
| 1 | PRIMARY | t4 | system | NULL | NULL | NULL | NULL | 1 | Using temporary; Using filesort |
| 1 | PRIMARY | t1 | const | PRIMARY | PRIMARY | 4 | const | 1 | |
| 1 | PRIMARY | t2 | ALL | NULL | NULL | NULL | NULL | 2 | Using where |
| 1 | PRIMARY | t3 | hash_index | i3 | #hash#i3:i3 | 5:5 | j2.t2.i2 | 3 | Using index; Start temporary; End temporary; Using join buffer (flat, BNLH join) |
| 1 | PRIMARY | t3 | hash_ALL | c3 | #hash#c3 | 2 | j2.t2.c2 | 3 | Using where; Using join buffer (flat, BNLH join) |
+------+-------------+-------+------------+---------------+-------------+---------+----------+------+----------------------------------------------------------------------------------+