[MCOL-4759] DISTINCT and UNION performance degradation Created: 2021-06-15  Updated: 2021-06-29  Resolved: 2021-06-29

Status: Closed
Project: MariaDB ColumnStore
Component/s: ExeMgr
Affects Version/s: 6.1.1
Fix Version/s: 6.1.1

Type: Bug Priority: Blocker
Reporter: Roman Assignee: Daniel Lee (Inactive)
Resolution: Fixed Votes: 0
Labels: None

Issue Links:
PartOf
is part of MCOL-4564 Performance issues for DBT3 queries #... Closed
Relates
relates to MCOL-4686 count(DINSTINCT) with a large number ... Closed
relates to MCOL-4760 Unify hash implementation Closed
Sprint: 2021-9

 Description   

The issue had been fixed in develop-5 however it wasn't fixed in develop b/c of the upcoming wide-decimal JOINs feature to be merged.
Aggregates are now not affected b/c the code related to aggregate calculation has a separate hash function for Row class. The affected features are: DISTINCT, UNION and typeless JOIN that either use Row::hash() method or duplicate its code.
Here are the evidences.
The stats running the query query

select * from (select distinct l_orderkey from (select l_orderkey from lineitem limit 5000000)s1)s2 limit 100;

without the patch:

Latency (ms):
         min:                                 5380.21
         avg:                                 6020.17
         max:                                 6507.55
         95th percentile:                     6476.48

and with the patch a

Latency (ms):
         min:                                  556.78
         avg:                                  572.92
         max:                                  603.67
         95th percentile:                      601.29

So there is 10x degradation in timings running distinct.

diff --git a/utils/common/collation.h b/utils/common/collation.h
index 8d4306213..1152b8baf 100644
--- a/utils/common/collation.h
+++ b/utils/common/collation.h
@@ -109,11 +109,14 @@ namespace datatypes
 
 class MariaDBHasher
 {
+    static const ulong mPart1DefValue = 1;
+    static const ulong mPart2DefValue = 4;
+
     ulong mPart1;
     ulong mPart2;
 public:
     MariaDBHasher()
-        :mPart1(1), mPart2(4)
+        :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
     { }
     MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length)
     {
@@ -124,9 +127,13 @@ public:
     {
         return add(cs, str.str(), str.length());
     }
-    uint32_t finalize() const
+    uint64_t finalize() const
+    {
+        return mPart1;
+    }
+    bool wasUsed() const
     {
-        return (uint32_t) mPart1;
+        return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue;
     }
 };
 
diff --git a/utils/common/hasher.h b/utils/common/hasher.h
index 1633986a2..3f761b6bb 100644
--- a/utils/common/hasher.h
+++ b/utils/common/hasher.h
@@ -402,7 +402,6 @@ private:
     uint32_t fCmpLen;
 };
 
-
 }
 
 #endif  // UTILS_HASHER_H
diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
new file mode 100644
index 000000000..ef2f3ec25
--- /dev/null
+++ b/utils/common/hashfamily.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2021 Mariadb Corporation.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#ifndef UTILS_HASHFAMILY_H
+#define UTILS_HASHFAMILY_H
+
+#include "hasher.h"
+#include "collation.h"
+
+namespace utils
+{
+
+class HashFamily 
+{
+  public:
+    HashFamily(const utils::Hasher_r& h,
+      const uint64_t intermediateHash,
+      const uint64_t len,
+      const datatypes::MariaDBHasher& hM) : mHasher(h),
+                                            mMariaDBHasher(hM),
+                                            mHasher_rHash(intermediateHash),
+                                            mHasher_rLen(len)
+    { }
+
+    // Algorithm, seed and factor are taken from this discussion
+    // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
+    inline uint64_t finalize() const
+    {
+     // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize()
+     //                                   : mHasher.finalize(mHasher_rHash, mHasher_rLen);
+      return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
+
+    }
+  private:
+    constexpr static uint64_t seed = 1009ULL;
+    constexpr static uint64_t factor = 9176ULL;
+
+    const utils::Hasher_r& mHasher;
+    const datatypes::MariaDBHasher& mMariaDBHasher;
+    const uint64_t mHasher_rHash;
+    const uint32_t mHasher_rLen;
+};
+
+}
+#endif
+// vim:ts=2 sw=2:
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
index ad4e78067..463c50b73 100644
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@@ -60,6 +60,7 @@
 #include "../winport/winport.h"
 
 #include "collation.h"
+#include "common/hashfamily.h"
 
 
 // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
@@ -558,7 +559,10 @@ public:
     // a fcn to check the type defs seperately doesn't exist yet.  No normalization.
     inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
     inline uint64_t hash() const;  // generates a hash for all cols
-    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
+    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
+                                       const utils::Hasher_r& h,
+                                       const uint32_t col,
+                                       uint32_t& intermediateHash) const;
     inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const;
     inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const
     {
@@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
 }
 
 
-inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
+inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
+                                        const utils::Hasher_r& h,
+                                        const uint32_t col,
+                                        uint32_t& intermediateHash) const
 {
     switch (getColType(col))
     {
@@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
         case execplan::CalpontSystemCatalog::TEXT:
         {
             CHARSET_INFO *cs = getCharset(col);
-            h.add(cs, getConstString(col));
+            hM.add(cs, getConstString(col));
             break;
         }
         default:
-            h.add(&my_charset_bin, getShortConstString(col));
+        {
+            intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
             break;
+        }
     }
 }
 
@@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const
 
 inline uint64_t Row::hash(uint32_t lastCol) const
 {
-    datatypes::MariaDBHasher h;
-
+    // Use two hash classes. MariaDBHasher for text-based
+    // collation-aware data types and Hasher_r for all other data types.
+    // We deliver a hash that is a combination of both hashers' results.
+    utils::Hasher_r h;
+    datatypes::MariaDBHasher hM;
+    uint32_t intermediateHash = 0;
     // Sometimes we ask this to hash 0 bytes, and it comes through looking like
     // lastCol = -1.  Return 0.
     if (lastCol >= columnCount)
         return 0;
 
     for (uint32_t i = 0; i <= lastCol; i++)
-      colUpdateMariaDBHasher(h, i);
+        colUpdateMariaDBHasher(hM, h, i, intermediateHash);
 
-    return h.finalize();
+    return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
 }
 
 inline bool Row::equals(const Row& r2) const
root@ip-172-31-3-254:/data/mdb-server/storage/columnstore/columnstore# git --no-pager diff HEAD~
diff --git a/utils/common/collation.h b/utils/common/collation.h
index 8d4306213..1152b8baf 100644
--- a/utils/common/collation.h
+++ b/utils/common/collation.h
@@ -109,11 +109,14 @@ namespace datatypes
 
 class MariaDBHasher
 {
+    static const ulong mPart1DefValue = 1;
+    static const ulong mPart2DefValue = 4;
+
     ulong mPart1;
     ulong mPart2;
 public:
     MariaDBHasher()
-        :mPart1(1), mPart2(4)
+        :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
     { }
     MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length)
     {
@@ -124,9 +127,13 @@ public:
     {
         return add(cs, str.str(), str.length());
     }
-    uint32_t finalize() const
+    uint64_t finalize() const
+    {
+        return mPart1;
+    }
+    bool wasUsed() const
     {
-        return (uint32_t) mPart1;
+        return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue;
     }
 };
 
diff --git a/utils/common/hasher.h b/utils/common/hasher.h
index 1633986a2..3f761b6bb 100644
--- a/utils/common/hasher.h
+++ b/utils/common/hasher.h
@@ -402,7 +402,6 @@ private:
     uint32_t fCmpLen;
 };
 
-
 }
 
 #endif  // UTILS_HASHER_H
diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
new file mode 100644
index 000000000..ef2f3ec25
--- /dev/null
+++ b/utils/common/hashfamily.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2021 Mariadb Corporation.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#ifndef UTILS_HASHFAMILY_H
+#define UTILS_HASHFAMILY_H
+
+#include "hasher.h"
+#include "collation.h"
+
+namespace utils
+{
+
+class HashFamily 
+{
+  public:
+    HashFamily(const utils::Hasher_r& h,
+      const uint64_t intermediateHash,
+      const uint64_t len,
+      const datatypes::MariaDBHasher& hM) : mHasher(h),
+                                            mMariaDBHasher(hM),
+                                            mHasher_rHash(intermediateHash),
+                                            mHasher_rLen(len)
+    { }
+
+    // Algorithm, seed and factor are taken from this discussion
+    // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
+    inline uint64_t finalize() const
+    {
+     // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize()
+     //                                   : mHasher.finalize(mHasher_rHash, mHasher_rLen);
+      return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
+
+    }
+  private:
+    constexpr static uint64_t seed = 1009ULL;
+    constexpr static uint64_t factor = 9176ULL;
+
+    const utils::Hasher_r& mHasher;
+    const datatypes::MariaDBHasher& mMariaDBHasher;
+    const uint64_t mHasher_rHash;
+    const uint32_t mHasher_rLen;
+};
+
+}
+#endif
+// vim:ts=2 sw=2:
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
index ad4e78067..463c50b73 100644
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@@ -60,6 +60,7 @@
 #include "../winport/winport.h"
 
 #include "collation.h"
+#include "common/hashfamily.h"
 
 
 // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
@@ -558,7 +559,10 @@ public:
     // a fcn to check the type defs seperately doesn't exist yet.  No normalization.
     inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
     inline uint64_t hash() const;  // generates a hash for all cols
-    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
+    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
+                                       const utils::Hasher_r& h,
+                                       const uint32_t col,
+                                       uint32_t& intermediateHash) const;
     inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const;
     inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const
     {
@@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
 }
 
 
-inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
+inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
+                                        const utils::Hasher_r& h,
+                                        const uint32_t col,
+                                        uint32_t& intermediateHash) const
 {
     switch (getColType(col))
     {
@@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
         case execplan::CalpontSystemCatalog::TEXT:
         {
             CHARSET_INFO *cs = getCharset(col);
-            h.add(cs, getConstString(col));
+            hM.add(cs, getConstString(col));
             break;
         }
         default:
-            h.add(&my_charset_bin, getShortConstString(col));
+        {
+            intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
             break;
+        }
     }
 }
 
@@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const
 
 inline uint64_t Row::hash(uint32_t lastCol) const
 {
-    datatypes::MariaDBHasher h;
-
+    // Use two hash classes. MariaDBHasher for text-based
+    // collation-aware data types and Hasher_r for all other data types.
+    // We deliver a hash that is a combination of both hashers' results.
+    utils::Hasher_r h;
+    datatypes::MariaDBHasher hM;
+    uint32_t intermediateHash = 0;
     // Sometimes we ask this to hash 0 bytes, and it comes through looking like
     // lastCol = -1.  Return 0.
     if (lastCol >= columnCount)
         return 0;
 
     for (uint32_t i = 0; i <= lastCol; i++)
-      colUpdateMariaDBHasher(h, i);
+        colUpdateMariaDBHasher(hM, h, i, intermediateHash);
 
-    return h.finalize();
+    return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
 }
 
 inline bool Row::equals(const Row& r2) const



 Comments   
Comment by Roman [ 2021-06-28 ]

4QA There is DISTINCT example in the description.

Comment by Daniel Lee (Inactive) [ 2021-06-29 ]

Build verified: 6.1.1 ( Drone #2669 )

Testing the query on a 10gb dbt3 database. Timing is comparable to that in 5.5.2.

Generated at Thu Feb 08 02:52:46 UTC 2024 using Jira 8.20.16#820016-sha1:9d11dbea5f4be3d4cc21f03a88dd11d8c8687422.