From 5bdc57004682a5e0236ec630546d20ad752c2fde Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Feb 2021 01:56:04 +0300 Subject: [PATCH] Improve performance of GROUP BY multiple fixed size keys --- src/Common/ColumnsHashing.h | 71 ++++++++++++++++++++++- src/Interpreters/AggregationCommon.h | 32 ++++++++++ src/Interpreters/Aggregator.h | 8 ++- tests/performance/group_by_fixed_keys.xml | 7 +++ 4 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 tests/performance/group_by_fixed_keys.xml diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index b1d25c98955..1ac753fbae5 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -455,7 +455,14 @@ template <> struct LowCardinalityKeys {}; /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits. -template +template < + typename Value, + typename Key, + typename Mapped, + bool has_nullable_keys_ = false, + bool has_low_cardinality_ = false, + bool use_cache = true, + bool need_offset = false> struct HashMethodKeysFixed : private columns_hashing_impl::BaseStateKeysFixed , public columns_hashing_impl::HashMethodBase, Value, Mapped, use_cache, need_offset> @@ -471,6 +478,12 @@ struct HashMethodKeysFixed Sizes key_sizes; size_t keys_size; + /// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here. 
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) + std::unique_ptr masks; + std::unique_ptr columns_data; +#endif + HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &) : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size()) { @@ -491,6 +504,58 @@ struct HashMethodKeysFixed low_cardinality_keys.nested_columns[i] = key_columns[i]; } } + +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) + if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) + { + /** The task is to "pack" multiple fixed-size fields into a single larger Key. + * Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key: + * [- ---- -- -------- -] - the resulting uint128 key + * ^ ^ ^ ^ ^ + * u8 u32 u16 u64 zero + * + * We can do it with the help of the SSSE3 shuffle instruction. + * + * There will be a mask for every GROUP BY element (keys_size masks in total). + * Every mask has 16 bytes but only sizeof(Key) bytes are used (we don't care about the others). + * + * Every byte in the mask has the following meaning: + * - if it is 0..15, take the element at this index from the source register and place it here in the result; + * - if it is 0xFF - set the element in the result to zero. + * + * Example: + * We want to copy UInt32 to offset 1 in the destination and set the other bytes in the destination to zero. + * The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF + * + * The max size of the destination is 16 bytes, because we cannot process more with SSSE3. + * + * The method is disabled under MSan, because this method is allowed + * to load into an SSE register and process up to 15 bytes of uninitialized memory in columns padding. + * We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction. + * + * 16-byte masks can be placed overlapping: only the first sizeof(Key) bytes are relevant in each mask. 
+ * We initialize them to 0xFF and then set the needed elements. + */ + size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key)); + masks.reset(new uint8_t[total_masks_size]); + memset(masks.get(), 0xFF, total_masks_size); + + size_t offset = 0; + for (size_t i = 0; i < keys_size; ++i) + { + for (size_t j = 0; j < key_sizes[i]; ++j) + { + masks[i * sizeof(Key) + offset] = j; + ++offset; + } + } + + columns_data.reset(new const char*[keys_size]); + + for (size_t i = 0; i < keys_size; ++i) + columns_data[i] = Base::getActualColumns()[i]->getRawData().data; + } +#endif } ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const @@ -506,6 +571,10 @@ struct HashMethodKeysFixed return packFixed(row, keys_size, low_cardinality_keys.nested_columns, key_sizes, &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes); +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) + if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) + return packFixedShuffle(columns_data.get(), keys_size, key_sizes.data(), row, masks.get()); +#endif return packFixed(row, keys_size, Base::getActualColumns(), key_sizes); } } diff --git a/src/Interpreters/AggregationCommon.h b/src/Interpreters/AggregationCommon.h index f70ab282e6f..ca9b00184fb 100644 --- a/src/Interpreters/AggregationCommon.h +++ b/src/Interpreters/AggregationCommon.h @@ -15,6 +15,10 @@ #include #include +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) +#include +#endif + template <> struct DefaultHash : public StringRefHash {}; @@ -255,4 +259,32 @@ static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous( } +/** Pack elements with shuffle instruction. 
+ * See the explanation in ColumnsHashing.h + */ +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) +template /* NOTE(review): the template parameter list (and the target types of the two reinterpret_cast calls below) appear to have been stripped from this text; confirm against the original commit */ +static T ALWAYS_INLINE packFixedShuffle( + const char * __restrict * __restrict srcs, /* srcs[i] is the base pointer of source i */ + size_t num_srcs, /* number of entries processed from srcs, elem_sizes and masks */ + const size_t * __restrict elem_sizes, /* elem_sizes[i] is the byte stride used to reach element idx of source i */ + size_t idx, /* element index, the same for every source */ + const uint8_t * __restrict masks) /* the shuffle mask for source i starts at byte i * sizeof(T) */ +{ + __m128i res{}; /* value-initialized 128-bit accumulator for the packed result */ + + for (size_t i = 0; i < num_srcs; ++i) + { + res = _mm_xor_si128(res, /* XOR the shuffled bytes of source i into the accumulator */ + _mm_shuffle_epi8( + _mm_loadu_si128(reinterpret_cast(srcs[i] + elem_sizes[i] * idx)), /* unaligned 16-byte load starting at element idx; NOTE(review): always reads 16 bytes regardless of elem_sizes[i], presumably callers guarantee readable memory there, confirm */ + _mm_loadu_si128(reinterpret_cast(&masks[i * sizeof(T)])))); /* unaligned 16-byte load of the mask for source i */ + } + + T out; + __builtin_memcpy(&out, &res, sizeof(T)); /* only the first sizeof(T) bytes of the 128-bit result are copied out */ + return out; +} +#endif + } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 2a1224b0b48..c5bcc1eb27f 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -365,7 +365,13 @@ struct AggregationMethodKeysFixed template AggregationMethodKeysFixed(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodKeysFixed; + using State = ColumnsHashing::HashMethodKeysFixed< + typename Data::value_type, + Key, + Mapped, + has_nullable_keys, + has_low_cardinality, + use_cache>; static const bool low_cardinality_optimization = false; diff --git a/tests/performance/group_by_fixed_keys.xml b/tests/performance/group_by_fixed_keys.xml new file mode 100644 index 00000000000..0be29ff11ac --- /dev/null +++ b/tests/performance/group_by_fixed_keys.xml @@ -0,0 +1,7 @@ + + WITH toUInt8(number) AS k, toUInt64(k) AS k1, k AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 + WITH toUInt8(number) AS k, toUInt16(k) AS k1, toUInt32(k) AS k2, k AS k3 SELECT k1, k2, k3, count() FROM numbers(100000000) GROUP BY k1, k2, k3 + WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 + WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2, k + 2 AS k3, k + 3 AS k4 SELECT k1, k2, k3, k4, count() FROM numbers(100000000) GROUP BY k1, k2, k3, k4 + WITH 
toUInt8(number) AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 +