Details
-
Bug
-
Status: Closed (View Workflow)
-
Blocker
-
Resolution: Fixed
-
6.1.1
-
None
-
2021-9
Description
The issue had been fixed in develop-5 however it wasn't fixed in develop b/c of the upcoming wide-decimal JOINs feature to be merged.
Aggregates are now not affected b/c the code related to aggregate calculation has a separate hash function for Row class. The affected features are: DISTINCT, UNION and typeless JOIN that either use Row::hash() method or duplicate its code.
Here are the evidences.
The stats running the query query
select * from (select distinct l_orderkey from (select l_orderkey from lineitem limit 5000000)s1)s2 limit 100;
|
without the patch:
Latency (ms):
|
min: 5380.21
|
avg: 6020.17
|
max: 6507.55
|
95th percentile: 6476.48
|
and with the patch a
Latency (ms):
|
min: 556.78
|
avg: 572.92
|
max: 603.67
|
95th percentile: 601.29
|
So there is 10x degradation in timings running distinct.
diff --git a/utils/common/collation.h b/utils/common/collation.h
|
index 8d4306213..1152b8baf 100644
|
--- a/utils/common/collation.h
|
+++ b/utils/common/collation.h
|
@@ -109,11 +109,14 @@ namespace datatypes |
|
class MariaDBHasher |
{
|
+ static const ulong mPart1DefValue = 1; |
+ static const ulong mPart2DefValue = 4; |
+
|
ulong mPart1;
|
ulong mPart2;
|
public: |
MariaDBHasher()
|
- :mPart1(1), mPart2(4)
|
+ :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
|
{ }
|
MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length) |
{
|
@@ -124,9 +127,13 @@ public: |
{
|
return add(cs, str.str(), str.length()); |
}
|
- uint32_t finalize() const |
+ uint64_t finalize() const |
+ {
|
+ return mPart1; |
+ }
|
+ bool wasUsed() const |
{
|
- return (uint32_t) mPart1; |
+ return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue; |
}
|
};
|
|
diff --git a/utils/common/hasher.h b/utils/common/hasher.h
|
index 1633986a2..3f761b6bb 100644
|
--- a/utils/common/hasher.h
|
+++ b/utils/common/hasher.h
|
@@ -402,7 +402,6 @@ private: |
uint32_t fCmpLen;
|
};
|
|
-
|
}
|
|
#endif // UTILS_HASHER_H
|
diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
|
new file mode 100644 |
index 000000000..ef2f3ec25
|
--- /dev/null
|
+++ b/utils/common/hashfamily.h
|
@@ -0,0 +1,60 @@
|
+/* Copyright (C) 2021 Mariadb Corporation. |
+
|
+ This program is free software; you can redistribute it and/or
|
+ modify it under the terms of the GNU General Public License
|
+ as published by the Free Software Foundation; version 2 of
|
+ the License.
|
+
|
+ This program is distributed in the hope that it will be useful,
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
+ GNU General Public License for more details.
|
+
|
+ You should have received a copy of the GNU General Public License
|
+ along with this program; if not, write to the Free Software
|
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
+ MA 02110-1301, USA. */
|
+
|
+#ifndef UTILS_HASHFAMILY_H
|
+#define UTILS_HASHFAMILY_H
|
+
|
+#include "hasher.h" |
+#include "collation.h" |
+
|
+namespace utils |
+{
|
+
|
+class HashFamily |
+{
|
+ public: |
+ HashFamily(const utils::Hasher_r& h, |
+ const uint64_t intermediateHash, |
+ const uint64_t len, |
+ const datatypes::MariaDBHasher& hM) : mHasher(h), |
+ mMariaDBHasher(hM),
|
+ mHasher_rHash(intermediateHash),
|
+ mHasher_rLen(len)
|
+ { }
|
+
|
+ // Algorithm, seed and factor are taken from this discussion |
+ // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations |
+ inline uint64_t finalize() const |
+ {
|
+ // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize() |
+ // : mHasher.finalize(mHasher_rHash, mHasher_rLen); |
+ return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize(); |
+
|
+ }
|
+ private: |
+ constexpr static uint64_t seed = 1009ULL; |
+ constexpr static uint64_t factor = 9176ULL; |
+
|
+ const utils::Hasher_r& mHasher; |
+ const datatypes::MariaDBHasher& mMariaDBHasher; |
+ const uint64_t mHasher_rHash; |
+ const uint32_t mHasher_rLen; |
+};
|
+
|
+}
|
+#endif
|
+// vim:ts=2 sw=2: |
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
|
index ad4e78067..463c50b73 100644
|
--- a/utils/rowgroup/rowgroup.h
|
+++ b/utils/rowgroup/rowgroup.h
|
@@ -60,6 +60,7 @@
|
#include "../winport/winport.h"
|
|
#include "collation.h"
|
+#include "common/hashfamily.h" |
|
|
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace |
@@ -558,7 +559,10 @@ public: |
// a fcn to check the type defs seperately doesn't exist yet. No normalization. |
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol] |
inline uint64_t hash() const; // generates a hash for all cols |
- inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const; |
+ inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM, |
+ const utils::Hasher_r& h, |
+ const uint32_t col, |
+ uint32_t& intermediateHash) const; |
inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const; |
inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const |
{
|
@@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const |
}
|
|
|
-inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const |
+inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM, |
+ const utils::Hasher_r& h, |
+ const uint32_t col, |
+ uint32_t& intermediateHash) const |
{
|
switch (getColType(col)) |
{
|
@@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co |
case execplan::CalpontSystemCatalog::TEXT: |
{
|
CHARSET_INFO *cs = getCharset(col);
|
- h.add(cs, getConstString(col));
|
+ hM.add(cs, getConstString(col));
|
break; |
}
|
default: |
- h.add(&my_charset_bin, getShortConstString(col));
|
+ {
|
+ intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash); |
break; |
+ }
|
}
|
}
|
|
@@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const |
|
inline uint64_t Row::hash(uint32_t lastCol) const |
{
|
- datatypes::MariaDBHasher h;
|
-
|
+ // Use two hash classes. MariaDBHasher for text-based |
+ // collation-aware data types and Hasher_r for all other data types. |
+ // We deliver a hash that is a combination of both hashers' results. |
+ utils::Hasher_r h;
|
+ datatypes::MariaDBHasher hM;
|
+ uint32_t intermediateHash = 0;
|
// Sometimes we ask this to hash 0 bytes, and it comes through looking like |
// lastCol = -1. Return 0. |
if (lastCol >= columnCount) |
return 0; |
|
for (uint32_t i = 0; i <= lastCol; i++) |
- colUpdateMariaDBHasher(h, i);
|
+ colUpdateMariaDBHasher(hM, h, i, intermediateHash);
|
|
- return h.finalize(); |
+ return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize(); |
}
|
|
inline bool Row::equals(const Row& r2) const |
root@ip-172-31-3-254:/data/mdb-server/storage/columnstore/columnstore# git --no-pager diff HEAD~
|
diff --git a/utils/common/collation.h b/utils/common/collation.h
|
index 8d4306213..1152b8baf 100644
|
--- a/utils/common/collation.h
|
+++ b/utils/common/collation.h
|
@@ -109,11 +109,14 @@ namespace datatypes |
|
class MariaDBHasher |
{
|
+ static const ulong mPart1DefValue = 1; |
+ static const ulong mPart2DefValue = 4; |
+
|
ulong mPart1;
|
ulong mPart2;
|
public: |
MariaDBHasher()
|
- :mPart1(1), mPart2(4)
|
+ :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
|
{ }
|
MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length) |
{
|
@@ -124,9 +127,13 @@ public: |
{
|
return add(cs, str.str(), str.length()); |
}
|
- uint32_t finalize() const |
+ uint64_t finalize() const |
+ {
|
+ return mPart1; |
+ }
|
+ bool wasUsed() const |
{
|
- return (uint32_t) mPart1; |
+ return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue; |
}
|
};
|
|
diff --git a/utils/common/hasher.h b/utils/common/hasher.h
|
index 1633986a2..3f761b6bb 100644
|
--- a/utils/common/hasher.h
|
+++ b/utils/common/hasher.h
|
@@ -402,7 +402,6 @@ private: |
uint32_t fCmpLen;
|
};
|
|
-
|
}
|
|
#endif // UTILS_HASHER_H
|
diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
|
new file mode 100644 |
index 000000000..ef2f3ec25
|
--- /dev/null
|
+++ b/utils/common/hashfamily.h
|
@@ -0,0 +1,60 @@
|
+/* Copyright (C) 2021 Mariadb Corporation. |
+
|
+ This program is free software; you can redistribute it and/or
|
+ modify it under the terms of the GNU General Public License
|
+ as published by the Free Software Foundation; version 2 of
|
+ the License.
|
+
|
+ This program is distributed in the hope that it will be useful,
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
+ GNU General Public License for more details.
|
+
|
+ You should have received a copy of the GNU General Public License
|
+ along with this program; if not, write to the Free Software
|
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
+ MA 02110-1301, USA. */
|
+
|
+#ifndef UTILS_HASHFAMILY_H
|
+#define UTILS_HASHFAMILY_H
|
+
|
+#include "hasher.h" |
+#include "collation.h" |
+
|
+namespace utils |
+{
|
+
|
+class HashFamily |
+{
|
+ public: |
+ HashFamily(const utils::Hasher_r& h, |
+ const uint64_t intermediateHash, |
+ const uint64_t len, |
+ const datatypes::MariaDBHasher& hM) : mHasher(h), |
+ mMariaDBHasher(hM),
|
+ mHasher_rHash(intermediateHash),
|
+ mHasher_rLen(len)
|
+ { }
|
+
|
+ // Algorithm, seed and factor are taken from this discussion |
+ // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations |
+ inline uint64_t finalize() const |
+ {
|
+ // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize() |
+ // : mHasher.finalize(mHasher_rHash, mHasher_rLen); |
+ return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize(); |
+
|
+ }
|
+ private: |
+ constexpr static uint64_t seed = 1009ULL; |
+ constexpr static uint64_t factor = 9176ULL; |
+
|
+ const utils::Hasher_r& mHasher; |
+ const datatypes::MariaDBHasher& mMariaDBHasher; |
+ const uint64_t mHasher_rHash; |
+ const uint32_t mHasher_rLen; |
+};
|
+
|
+}
|
+#endif
|
+// vim:ts=2 sw=2: |
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
|
index ad4e78067..463c50b73 100644
|
--- a/utils/rowgroup/rowgroup.h
|
+++ b/utils/rowgroup/rowgroup.h
|
@@ -60,6 +60,7 @@
|
#include "../winport/winport.h"
|
|
#include "collation.h"
|
+#include "common/hashfamily.h" |
|
|
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace |
@@ -558,7 +559,10 @@ public: |
// a fcn to check the type defs seperately doesn't exist yet. No normalization. |
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol] |
inline uint64_t hash() const; // generates a hash for all cols |
- inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const; |
+ inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM, |
+ const utils::Hasher_r& h, |
+ const uint32_t col, |
+ uint32_t& intermediateHash) const; |
inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const; |
inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const |
{
|
@@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const |
}
|
|
|
-inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const |
+inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM, |
+ const utils::Hasher_r& h, |
+ const uint32_t col, |
+ uint32_t& intermediateHash) const |
{
|
switch (getColType(col)) |
{
|
@@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co |
case execplan::CalpontSystemCatalog::TEXT: |
{
|
CHARSET_INFO *cs = getCharset(col);
|
- h.add(cs, getConstString(col));
|
+ hM.add(cs, getConstString(col));
|
break; |
}
|
default: |
- h.add(&my_charset_bin, getShortConstString(col));
|
+ {
|
+ intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash); |
break; |
+ }
|
}
|
}
|
|
@@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const |
|
inline uint64_t Row::hash(uint32_t lastCol) const |
{
|
- datatypes::MariaDBHasher h;
|
-
|
+ // Use two hash classes. MariaDBHasher for text-based |
+ // collation-aware data types and Hasher_r for all other data types. |
+ // We deliver a hash that is a combination of both hashers' results. |
+ utils::Hasher_r h;
|
+ datatypes::MariaDBHasher hM;
|
+ uint32_t intermediateHash = 0;
|
// Sometimes we ask this to hash 0 bytes, and it comes through looking like |
// lastCol = -1. Return 0. |
if (lastCol >= columnCount) |
return 0; |
|
for (uint32_t i = 0; i <= lastCol; i++) |
- colUpdateMariaDBHasher(h, i);
|
+ colUpdateMariaDBHasher(hM, h, i, intermediateHash);
|
|
- return h.finalize(); |
+ return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize(); |
}
|
|
inline bool Row::equals(const Row& r2) const |