Uploaded image for project: 'MariaDB ColumnStore'
  1. MariaDB ColumnStore
  2. MCOL-4759

DISTINCT and UNION performance degradation

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed (View Workflow)
    • Priority: Blocker
    • Resolution: Fixed
    • Affects Version/s: 6.1.1
    • Fix Version/s: 6.1.1
    • Component/s: ExeMgr
    • Labels:
      None
    • Sprint:
      2021-9

      Description

      The issue had been fixed in develop-5 however it wasn't fixed in develop b/c of the upcoming wide-decimal JOINs feature to be merged.
      Aggregates are now not affected b/c the code related to aggregate calculation has a separate hash function for Row class. The affected features are: DISTINCT, UNION and typeless JOIN that either use Row::hash() method or duplicate its code.
      Here are the evidences.
      The stats running the query query

      select * from (select distinct l_orderkey from (select l_orderkey from lineitem limit 5000000)s1)s2 limit 100;
      

      without the patch:

      Latency (ms):
               min:                                 5380.21
               avg:                                 6020.17
               max:                                 6507.55
               95th percentile:                     6476.48
      

      and with the patch a

      Latency (ms):
               min:                                  556.78
               avg:                                  572.92
               max:                                  603.67
               95th percentile:                      601.29
      

      So there is 10x degradation in timings running distinct.

      diff --git a/utils/common/collation.h b/utils/common/collation.h
      index 8d4306213..1152b8baf 100644
      --- a/utils/common/collation.h
      +++ b/utils/common/collation.h
      @@ -109,11 +109,14 @@ namespace datatypes
       
       class MariaDBHasher
       {
      +    static const ulong mPart1DefValue = 1;
      +    static const ulong mPart2DefValue = 4;
      +
           ulong mPart1;
           ulong mPart2;
       public:
           MariaDBHasher()
      -        :mPart1(1), mPart2(4)
      +        :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
           { }
           MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length)
           {
      @@ -124,9 +127,13 @@ public:
           {
               return add(cs, str.str(), str.length());
           }
      -    uint32_t finalize() const
      +    uint64_t finalize() const
      +    {
      +        return mPart1;
      +    }
      +    bool wasUsed() const
           {
      -        return (uint32_t) mPart1;
      +        return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue;
           }
       };
       
      diff --git a/utils/common/hasher.h b/utils/common/hasher.h
      index 1633986a2..3f761b6bb 100644
      --- a/utils/common/hasher.h
      +++ b/utils/common/hasher.h
      @@ -402,7 +402,6 @@ private:
           uint32_t fCmpLen;
       };
       
      -
       }
       
       #endif  // UTILS_HASHER_H
      diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
      new file mode 100644
      index 000000000..ef2f3ec25
      --- /dev/null
      +++ b/utils/common/hashfamily.h
      @@ -0,0 +1,60 @@
      +/* Copyright (C) 2021 Mariadb Corporation.
      +
      +   This program is free software; you can redistribute it and/or
      +   modify it under the terms of the GNU General Public License
      +   as published by the Free Software Foundation; version 2 of
      +   the License.
      +
      +   This program is distributed in the hope that it will be useful,
      +   but WITHOUT ANY WARRANTY; without even the implied warranty of
      +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      +   GNU General Public License for more details.
      +
      +   You should have received a copy of the GNU General Public License
      +   along with this program; if not, write to the Free Software
      +   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
      +   MA 02110-1301, USA. */
      +
      +#ifndef UTILS_HASHFAMILY_H
      +#define UTILS_HASHFAMILY_H
      +
      +#include "hasher.h"
      +#include "collation.h"
      +
      +namespace utils
      +{
      +
      +class HashFamily 
      +{
      +  public:
      +    HashFamily(const utils::Hasher_r& h,
      +      const uint64_t intermediateHash,
      +      const uint64_t len,
      +      const datatypes::MariaDBHasher& hM) : mHasher(h),
      +                                            mMariaDBHasher(hM),
      +                                            mHasher_rHash(intermediateHash),
      +                                            mHasher_rLen(len)
      +    { }
      +
      +    // Algorithm, seed and factor are taken from this discussion
      +    // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
      +    inline uint64_t finalize() const
      +    {
      +     // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize()
      +     //                                   : mHasher.finalize(mHasher_rHash, mHasher_rLen);
      +      return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
      +
      +    }
      +  private:
      +    constexpr static uint64_t seed = 1009ULL;
      +    constexpr static uint64_t factor = 9176ULL;
      +
      +    const utils::Hasher_r& mHasher;
      +    const datatypes::MariaDBHasher& mMariaDBHasher;
      +    const uint64_t mHasher_rHash;
      +    const uint32_t mHasher_rLen;
      +};
      +
      +}
      +#endif
      +// vim:ts=2 sw=2:
      diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
      index ad4e78067..463c50b73 100644
      --- a/utils/rowgroup/rowgroup.h
      +++ b/utils/rowgroup/rowgroup.h
      @@ -60,6 +60,7 @@
       #include "../winport/winport.h"
       
       #include "collation.h"
      +#include "common/hashfamily.h"
       
       
       // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
      @@ -558,7 +559,10 @@ public:
           // a fcn to check the type defs seperately doesn't exist yet.  No normalization.
           inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
           inline uint64_t hash() const;  // generates a hash for all cols
      -    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
      +    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
      +                                       const utils::Hasher_r& h,
      +                                       const uint32_t col,
      +                                       uint32_t& intermediateHash) const;
           inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const;
           inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const
           {
      @@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
       }
       
       
      -inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
      +inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
      +                                        const utils::Hasher_r& h,
      +                                        const uint32_t col,
      +                                        uint32_t& intermediateHash) const
       {
           switch (getColType(col))
           {
      @@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
               case execplan::CalpontSystemCatalog::TEXT:
               {
                   CHARSET_INFO *cs = getCharset(col);
      -            h.add(cs, getConstString(col));
      +            hM.add(cs, getConstString(col));
                   break;
               }
               default:
      -            h.add(&my_charset_bin, getShortConstString(col));
      +        {
      +            intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
                   break;
      +        }
           }
       }
       
      @@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const
       
       inline uint64_t Row::hash(uint32_t lastCol) const
       {
      -    datatypes::MariaDBHasher h;
      -
      +    // Use two hash classes. MariaDBHasher for text-based
      +    // collation-aware data types and Hasher_r for all other data types.
      +    // We deliver a hash that is a combination of both hashers' results.
      +    utils::Hasher_r h;
      +    datatypes::MariaDBHasher hM;
      +    uint32_t intermediateHash = 0;
           // Sometimes we ask this to hash 0 bytes, and it comes through looking like
           // lastCol = -1.  Return 0.
           if (lastCol >= columnCount)
               return 0;
       
           for (uint32_t i = 0; i <= lastCol; i++)
      -      colUpdateMariaDBHasher(h, i);
      +        colUpdateMariaDBHasher(hM, h, i, intermediateHash);
       
      -    return h.finalize();
      +    return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
       }
       
       inline bool Row::equals(const Row& r2) const
      root@ip-172-31-3-254:/data/mdb-server/storage/columnstore/columnstore# git --no-pager diff HEAD~
      diff --git a/utils/common/collation.h b/utils/common/collation.h
      index 8d4306213..1152b8baf 100644
      --- a/utils/common/collation.h
      +++ b/utils/common/collation.h
      @@ -109,11 +109,14 @@ namespace datatypes
       
       class MariaDBHasher
       {
      +    static const ulong mPart1DefValue = 1;
      +    static const ulong mPart2DefValue = 4;
      +
           ulong mPart1;
           ulong mPart2;
       public:
           MariaDBHasher()
      -        :mPart1(1), mPart2(4)
      +        :mPart1(mPart1DefValue), mPart2(mPart2DefValue)
           { }
           MariaDBHasher & add(CHARSET_INFO * cs, const char *str, size_t length)
           {
      @@ -124,9 +127,13 @@ public:
           {
               return add(cs, str.str(), str.length());
           }
      -    uint32_t finalize() const
      +    uint64_t finalize() const
      +    {
      +        return mPart1;
      +    }
      +    bool wasUsed() const
           {
      -        return (uint32_t) mPart1;
      +        return mPart1 != mPart1DefValue || mPart2 != mPart2DefValue;
           }
       };
       
      diff --git a/utils/common/hasher.h b/utils/common/hasher.h
      index 1633986a2..3f761b6bb 100644
      --- a/utils/common/hasher.h
      +++ b/utils/common/hasher.h
      @@ -402,7 +402,6 @@ private:
           uint32_t fCmpLen;
       };
       
      -
       }
       
       #endif  // UTILS_HASHER_H
      diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
      new file mode 100644
      index 000000000..ef2f3ec25
      --- /dev/null
      +++ b/utils/common/hashfamily.h
      @@ -0,0 +1,60 @@
      +/* Copyright (C) 2021 Mariadb Corporation.
      +
      +   This program is free software; you can redistribute it and/or
      +   modify it under the terms of the GNU General Public License
      +   as published by the Free Software Foundation; version 2 of
      +   the License.
      +
      +   This program is distributed in the hope that it will be useful,
      +   but WITHOUT ANY WARRANTY; without even the implied warranty of
      +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      +   GNU General Public License for more details.
      +
      +   You should have received a copy of the GNU General Public License
      +   along with this program; if not, write to the Free Software
      +   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
      +   MA 02110-1301, USA. */
      +
      +#ifndef UTILS_HASHFAMILY_H
      +#define UTILS_HASHFAMILY_H
      +
      +#include "hasher.h"
      +#include "collation.h"
      +
      +namespace utils
      +{
      +
      +class HashFamily 
      +{
      +  public:
      +    HashFamily(const utils::Hasher_r& h,
      +      const uint64_t intermediateHash,
      +      const uint64_t len,
      +      const datatypes::MariaDBHasher& hM) : mHasher(h),
      +                                            mMariaDBHasher(hM),
      +                                            mHasher_rHash(intermediateHash),
      +                                            mHasher_rLen(len)
      +    { }
      +
      +    // Algorithm, seed and factor are taken from this discussion
      +    // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
      +    inline uint64_t finalize() const
      +    {
      +     // return (mMariaDBHasher.wasUsed()) ? (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize()
      +     //                                   : mHasher.finalize(mHasher_rHash, mHasher_rLen);
      +      return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
      +
      +    }
      +  private:
      +    constexpr static uint64_t seed = 1009ULL;
      +    constexpr static uint64_t factor = 9176ULL;
      +
      +    const utils::Hasher_r& mHasher;
      +    const datatypes::MariaDBHasher& mMariaDBHasher;
      +    const uint64_t mHasher_rHash;
      +    const uint32_t mHasher_rLen;
      +};
      +
      +}
      +#endif
      +// vim:ts=2 sw=2:
      diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
      index ad4e78067..463c50b73 100644
      --- a/utils/rowgroup/rowgroup.h
      +++ b/utils/rowgroup/rowgroup.h
      @@ -60,6 +60,7 @@
       #include "../winport/winport.h"
       
       #include "collation.h"
      +#include "common/hashfamily.h"
       
       
       // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
      @@ -558,7 +559,10 @@ public:
           // a fcn to check the type defs seperately doesn't exist yet.  No normalization.
           inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
           inline uint64_t hash() const;  // generates a hash for all cols
      -    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
      +    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
      +                                       const utils::Hasher_r& h,
      +                                       const uint32_t col,
      +                                       uint32_t& intermediateHash) const;
           inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const;
           inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const
           {
      @@ -930,7 +934,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
       }
       
       
      -inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
      +inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &hM,
      +                                        const utils::Hasher_r& h,
      +                                        const uint32_t col,
      +                                        uint32_t& intermediateHash) const
       {
           switch (getColType(col))
           {
      @@ -940,12 +947,14 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
               case execplan::CalpontSystemCatalog::TEXT:
               {
                   CHARSET_INFO *cs = getCharset(col);
      -            h.add(cs, getConstString(col));
      +            hM.add(cs, getConstString(col));
                   break;
               }
               default:
      -            h.add(&my_charset_bin, getShortConstString(col));
      +        {
      +            intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
                   break;
      +        }
           }
       }
       
      @@ -1417,17 +1426,21 @@ inline uint64_t Row::hash() const
       
       inline uint64_t Row::hash(uint32_t lastCol) const
       {
      -    datatypes::MariaDBHasher h;
      -
      +    // Use two hash classes. MariaDBHasher for text-based
      +    // collation-aware data types and Hasher_r for all other data types.
      +    // We deliver a hash that is a combination of both hashers' results.
      +    utils::Hasher_r h;
      +    datatypes::MariaDBHasher hM;
      +    uint32_t intermediateHash = 0;
           // Sometimes we ask this to hash 0 bytes, and it comes through looking like
           // lastCol = -1.  Return 0.
           if (lastCol >= columnCount)
               return 0;
       
           for (uint32_t i = 0; i <= lastCol; i++)
      -      colUpdateMariaDBHasher(h, i);
      +        colUpdateMariaDBHasher(hM, h, i, intermediateHash);
       
      -    return h.finalize();
      +    return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
       }
       
       inline bool Row::equals(const Row& r2) const
      

        Attachments

          Issue Links

            Activity

              People

              Assignee:
              dleeyh Daniel Lee
              Reporter:
              drrtuy Roman
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

                Dates

                Created:
                Updated:
                Resolved:

                  Git Integration