Improve performance of GROUP BY multiple fixed size keys

This commit is contained in:
Alexey Milovidov 2021-02-14 01:56:04 +03:00
parent 024d8491d6
commit 5bdc570046
4 changed files with 116 additions and 2 deletions

View File

@ -455,7 +455,14 @@ template <>
struct LowCardinalityKeys<false> {};
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true, bool need_offset = false>
template <
typename Value,
typename Key,
typename Mapped,
bool has_nullable_keys_ = false,
bool has_low_cardinality_ = false,
bool use_cache = true,
bool need_offset = false>
struct HashMethodKeysFixed
: private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
, public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
@ -471,6 +478,12 @@ struct HashMethodKeysFixed
Sizes key_sizes;
size_t keys_size;
/// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
/// Both members are declared only in builds with SSSE3 and without MemorySanitizer,
/// and they stay null unless the constructor enables the shuffle path.
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
/// One shuffle mask per key; mask i starts at byte i * sizeof(Key).
std::unique_ptr<uint8_t[]> masks;
/// For every key column, the pointer taken from getRawData().data in the constructor.
std::unique_ptr<const char*[]> columns_data;
#endif
HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
: Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
{
@ -491,6 +504,58 @@ struct HashMethodKeysFixed
low_cardinality_keys.nested_columns[i] = key_columns[i];
}
}
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
{
/** The task is to "pack" multiple fixed-size fields into a single larger Key.
 * Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key:
 * [- ---- -- -------- -] - the resulting uint128 key
 *  ^  ^   ^   ^       ^
 *  u8 u32 u16 u64    zero
 *
 * We can do it with the help of SSSE3 shuffle instruction.
 *
 * There will be a mask for every GROUP BY element (keys_size masks in total).
 * Every mask has 16 bytes but only sizeof(Key) bytes are used (we don't care about the others).
 *
 * Every byte in the mask has the following meaning:
 * - if it is 0..15, take the element at this index from source register and place here in the result;
 * - if it is 0xFF - set the element in the result to zero.
 *
 * Example:
 * We want to copy UInt32 to offset 1 in the destination and set other bytes in the destination as zero.
 * The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
 *
 * The max size of destination is 16 bytes, because we cannot process more with SSSE3.
 *
 * The method is disabled under MSan, because it's allowed
 * to load into SSE register and process up to 15 bytes of uninitialized memory in columns padding.
 * We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
 *
 * 16-byte masks can be placed overlapping, only the first sizeof(Key) bytes are relevant in each mask.
 * We initialize them to 0xFF and then set the needed elements.
 */
/// Masks are placed with a stride of sizeof(Key) bytes, but packFixedShuffle loads 16 bytes
/// for each of them, so (16 - sizeof(Key)) extra bytes are reserved after the last mask.
size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
masks.reset(new uint8_t[total_masks_size]);
/// 0xFF everywhere means "produce zero" until a byte is explicitly mapped below.
memset(masks.get(), 0xFF, total_masks_size);
/// `offset` is the running byte position inside the resulting key. It is intentionally
/// not reset between keys: in mask i, destination bytes [offset, offset + key_sizes[i])
/// are mapped to source bytes 0 .. key_sizes[i] - 1.
size_t offset = 0;
for (size_t i = 0; i < keys_size; ++i)
{
for (size_t j = 0; j < key_sizes[i]; ++j)
{
masks[i * sizeof(Key) + offset] = j;
++offset;
}
}
/// Store the data pointer reported by getRawData() for every key column;
/// getKeyHolder passes this array to packFixedShuffle.
columns_data.reset(new const char*[keys_size]);
for (size_t i = 0; i < keys_size; ++i)
columns_data[i] = Base::getActualColumns()[i]->getRawData().data;
}
#endif
}
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
@ -506,6 +571,10 @@ struct HashMethodKeysFixed
return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
&low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);
/// Shuffle-based packing. It is compiled only with SSSE3 and without MemorySanitizer, and taken only
/// when has_low_cardinality and has_nullable_keys are both false and Key is at most 16 bytes.
/// It uses the masks and column data pointers prepared in the constructor.
/// In every other case control falls through to the generic packFixed call below.
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
#endif
return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
}
}

View File

@ -15,6 +15,10 @@
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnLowCardinality.h>
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
#include <tmmintrin.h>
#endif
template <>
struct DefaultHash<StringRef> : public StringRefHash {};
@ -255,4 +259,32 @@ static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous(
}
/** Pack elements with shuffle instruction.
  * See the explanation in ColumnsHashing.h
  *
  * For every source i, 16 bytes are loaded starting at the element of row `idx`,
  * rearranged with the i-th shuffle mask and combined into one 16-byte value.
  * The first sizeof(T) bytes of that value are returned as the key.
  *
  * srcs       - num_srcs pointers to the beginning of the data of each source;
  * num_srcs   - number of sources (and of masks);
  * elem_sizes - size in bytes of a single element of each source;
  * idx        - index of the row to pack;
  * masks      - shuffle masks; the i-th mask starts at byte i * sizeof(T) and 16 bytes are read from it.
  *
  * Requirements for the caller:
  * - sizeof(T) must not exceed 16 bytes (checked at compile time);
  * - 16 bytes must be readable starting at every addressed element and at every mask,
  *   because full SSE registers are loaded regardless of the element size;
  * - the masks are expected to select disjoint destination bytes, so that XOR merely merges them.
  */
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
template <typename T>
static T ALWAYS_INLINE packFixedShuffle(
    const char * __restrict * __restrict srcs,
    size_t num_srcs,
    const size_t * __restrict elem_sizes,
    size_t idx,
    const uint8_t * __restrict masks)
{
    /// Only sizeof(T) bytes are copied out of the 16-byte register below;
    /// a wider T would read past the end of `res`.
    static_assert(sizeof(T) <= sizeof(__m128i), "packFixedShuffle can pack at most 16 bytes");

    __m128i res{};

    for (size_t i = 0; i < num_srcs; ++i)
    {
        /// Unaligned loads: neither the element nor the mask is guaranteed to be 16-byte aligned.
        const __m128i element = _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[i] + elem_sizes[i] * idx));
        const __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&masks[i * sizeof(T)]));

        /// Mask bytes equal to 0xFF produce zero, so every source contributes only its own bytes.
        res = _mm_xor_si128(res, _mm_shuffle_epi8(element, mask));
    }

    T out;
    __builtin_memcpy(&out, &res, sizeof(T));
    return out;
}
#endif
}

View File

@ -365,7 +365,13 @@ struct AggregationMethodKeysFixed
template <typename Other>
AggregationMethodKeysFixed(const Other & other) : data(other.data) {}
using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, Mapped, has_nullable_keys, has_low_cardinality, use_cache>;
using State = ColumnsHashing::HashMethodKeysFixed<
typename Data::value_type,
Key,
Mapped,
has_nullable_keys,
has_low_cardinality,
use_cache>;
static const bool low_cardinality_optimization = false;

View File

@ -0,0 +1,7 @@
<test>
<query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
<query>WITH toUInt8(number) AS k, toUInt16(k) AS k1, toUInt32(k) AS k2, k AS k3 SELECT k1, k2, k3, count() FROM numbers(100000000) GROUP BY k1, k2, k3</query>
<query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
<query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2, k + 2 AS k3, k + 3 AS k4 SELECT k1, k2, k3, k4, count() FROM numbers(100000000) GROUP BY k1, k2, k3, k4</query>
<query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
</test>