Improve performance of GROUP BY multiple fixed size keys

2024-11-10 01:25:21 +00:00 · 2021-02-14 01:56:04 +03:00 · 2021-02-14 01:56:04 +03:00 · 5bdc570046
commit 5bdc570046
parent 024d8491d6
4 changed files with 116 additions and 2 deletions
--- a/src/Common/ColumnsHashing.h
+++ b/src/Common/ColumnsHashing.h
@ -455,7 +455,14 @@ template <>
 struct LowCardinalityKeys<false> {};

 /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
-template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true, bool need_offset = false>
+template <
+    typename Value,
+    typename Key,
+    typename Mapped,
+    bool has_nullable_keys_ = false,
+    bool has_low_cardinality_ = false,
+    bool use_cache = true,
+    bool need_offset = false>
 struct HashMethodKeysFixed
    : private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
    , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
@ -471,6 +478,12 @@ struct HashMethodKeysFixed
    Sizes key_sizes;
    size_t keys_size;

+    /// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+    std::unique_ptr<uint8_t[]> masks;
+    std::unique_ptr<const char*[]> columns_data;
+#endif
+
    HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
        : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
    {
@ -491,6 +504,58 @@ struct HashMethodKeysFixed
                    low_cardinality_keys.nested_columns[i] = key_columns[i];
            }
        }
+
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+        if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
+        {
+            /** The task is to "pack" multiple fixed-size fields into single larger Key.
+              * Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key:
+              * [- ---- -- -------- -] - the resulting uint128 key
+              *  ^  ^   ^   ^       ^
+              *  u8 u32 u16 u64    zero
+              *
+              * We can do it with the help of SSSE3 shuffle instruction.
+              *
+              * There will be a mask for every GROUP BY element (keys_size masks in total).
+              * Every mask has 16 bytes but only sizeof(Key) bytes are used (other we don't care).
+              *
+              * Every byte in the mask has the following meaning:
+              * - if it is 0..15, take the element at this index from source register and place here in the result;
+              * - if it is 0xFF - set the elemend in the result to zero.
+              *
+              * Example:
+              * We want to copy UInt32 to offset 1 in the destination and set other bytes in the destination as zero.
+              * The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
+              *
+              * The max size of destination is 16 bytes, because we cannot process more with SSSE3.
+              *
+              * The method is disabled under MSan, because it's allowed
+              * to load into SSE register and process up to 15 bytes of uninitialized memory in columns padding.
+              * We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
+              *
+              * 16-bytes masks can be placed overlapping, only first sizeof(Key) bytes are relevant in each mask.
+              * We initialize them to 0xFF and then set the needed elements.
+              */
+            size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
+            masks.reset(new uint8_t[total_masks_size]);
+            memset(masks.get(), 0xFF, total_masks_size);
+
+            size_t offset = 0;
+            for (size_t i = 0; i < keys_size; ++i)
+            {
+                for (size_t j = 0; j < key_sizes[i]; ++j)
+                {
+                    masks[i * sizeof(Key) + offset] = j;
+                    ++offset;
+                }
+            }
+
+            columns_data.reset(new const char*[keys_size]);
+
+            for (size_t i = 0; i < keys_size; ++i)
+                columns_data[i] = Base::getActualColumns()[i]->getRawData().data;
+        }
+#endif
    }

    ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
@ -506,6 +571,10 @@ struct HashMethodKeysFixed
                return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
                                            &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);

+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+            if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
+                return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
+#endif
            return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
        }
    }
--- a/src/Interpreters/AggregationCommon.h
+++ b/src/Interpreters/AggregationCommon.h
@ -15,6 +15,10 @@
 #include <Columns/ColumnFixedString.h>
 #include <Columns/ColumnLowCardinality.h>

+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+#include <tmmintrin.h>
+#endif
+

 template <>
 struct DefaultHash<StringRef> : public StringRefHash {};
@ -255,4 +259,32 @@ static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous(
 }


+/** Pack elements with shuffle instruction.
+  * See the explanation in ColumnsHashing.h
+  */
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+template <typename T>
+static T ALWAYS_INLINE packFixedShuffle(
+    const char * __restrict * __restrict srcs,
+    size_t num_srcs,
+    const size_t * __restrict elem_sizes,
+    size_t idx,
+    const uint8_t * __restrict masks)
+{
+    __m128i res{};
+
+    for (size_t i = 0; i < num_srcs; ++i)
+    {
+        res = _mm_xor_si128(res,
+            _mm_shuffle_epi8(
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[i] + elem_sizes[i] * idx)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(&masks[i * sizeof(T)]))));
+    }
+
+    T out;
+    __builtin_memcpy(&out, &res, sizeof(T));
+    return out;
+}
+#endif
+
 }
--- a/src/Interpreters/Aggregator.h
+++ b/src/Interpreters/Aggregator.h
@ -365,7 +365,13 @@ struct AggregationMethodKeysFixed
    template <typename Other>
    AggregationMethodKeysFixed(const Other & other) : data(other.data) {}

-    using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, Mapped, has_nullable_keys, has_low_cardinality, use_cache>;
+    using State = ColumnsHashing::HashMethodKeysFixed<
+        typename Data::value_type,
+        Key,
+        Mapped,
+        has_nullable_keys,
+        has_low_cardinality,
+        use_cache>;

    static const bool low_cardinality_optimization = false;

--- a/tests/performance/group_by_fixed_keys.xml
+++ b/tests/performance/group_by_fixed_keys.xml
@ -0,0 +1,7 @@
+<test>
+    <query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
+    <query>WITH toUInt8(number) AS k, toUInt16(k) AS k1, toUInt32(k) AS k2, k AS k3 SELECT k1, k2, k3, count() FROM numbers(100000000) GROUP BY k1, k2, k3</query>
+    <query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
+    <query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2, k + 2 AS k3, k + 3 AS k4 SELECT k1, k2, k3, k4, count() FROM numbers(100000000) GROUP BY k1, k2, k3, k4</query>
+    <query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
+</test>