mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Improve performance of GROUP BY multiple fixed size keys
This commit is contained in:
parent
024d8491d6
commit
5bdc570046
@ -455,7 +455,14 @@ template <>
|
||||
struct LowCardinalityKeys<false> {};
|
||||
|
||||
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
|
||||
template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true, bool need_offset = false>
|
||||
template <
|
||||
typename Value,
|
||||
typename Key,
|
||||
typename Mapped,
|
||||
bool has_nullable_keys_ = false,
|
||||
bool has_low_cardinality_ = false,
|
||||
bool use_cache = true,
|
||||
bool need_offset = false>
|
||||
struct HashMethodKeysFixed
|
||||
: private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
|
||||
, public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
||||
@ -471,6 +478,12 @@ struct HashMethodKeysFixed
|
||||
Sizes key_sizes;
|
||||
size_t keys_size;
|
||||
|
||||
/// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
|
||||
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
||||
std::unique_ptr<uint8_t[]> masks;
|
||||
std::unique_ptr<const char*[]> columns_data;
|
||||
#endif
|
||||
|
||||
HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
|
||||
: Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
|
||||
{
|
||||
@ -491,6 +504,58 @@ struct HashMethodKeysFixed
|
||||
low_cardinality_keys.nested_columns[i] = key_columns[i];
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
||||
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
|
||||
{
|
||||
/** The task is to "pack" multiple fixed-size fields into single larger Key.
|
||||
* Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key:
|
||||
* [- ---- -- -------- -] - the resulting uint128 key
|
||||
* ^ ^ ^ ^ ^
|
||||
* u8 u32 u16 u64 zero
|
||||
*
|
||||
* We can do it with the help of SSSE3 shuffle instruction.
|
||||
*
|
||||
* There will be a mask for every GROUP BY element (keys_size masks in total).
|
||||
* Every mask has 16 bytes but only sizeof(Key) bytes are used (we don't care about the others).
|
||||
*
|
||||
* Every byte in the mask has the following meaning:
|
||||
* - if it is 0..15, take the element at this index from source register and place here in the result;
|
||||
* - if it is 0xFF - set the element in the result to zero.
|
||||
*
|
||||
* Example:
|
||||
* We want to copy UInt32 to offset 1 in the destination and set other bytes in the destination as zero.
|
||||
* The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
|
||||
*
|
||||
* The max size of destination is 16 bytes, because we cannot process more with SSSE3.
|
||||
*
|
||||
* The method is disabled under MSan, because it's allowed
|
||||
* to load into SSE register and process up to 15 bytes of uninitialized memory in columns padding.
|
||||
* We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
|
||||
*
|
||||
* 16-bytes masks can be placed overlapping, only first sizeof(Key) bytes are relevant in each mask.
|
||||
* We initialize them to 0xFF and then set the needed elements.
|
||||
*/
|
||||
size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
|
||||
masks.reset(new uint8_t[total_masks_size]);
|
||||
memset(masks.get(), 0xFF, total_masks_size);
|
||||
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < keys_size; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < key_sizes[i]; ++j)
|
||||
{
|
||||
masks[i * sizeof(Key) + offset] = j;
|
||||
++offset;
|
||||
}
|
||||
}
|
||||
|
||||
columns_data.reset(new const char*[keys_size]);
|
||||
|
||||
for (size_t i = 0; i < keys_size; ++i)
|
||||
columns_data[i] = Base::getActualColumns()[i]->getRawData().data;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
|
||||
@ -506,6 +571,10 @@ struct HashMethodKeysFixed
|
||||
return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
|
||||
&low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);
|
||||
|
||||
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
||||
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
|
||||
return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
|
||||
#endif
|
||||
return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
|
||||
}
|
||||
}
|
||||
|
@ -15,6 +15,10 @@
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnLowCardinality.h>
|
||||
|
||||
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
template <>
|
||||
struct DefaultHash<StringRef> : public StringRefHash {};
|
||||
@ -255,4 +259,32 @@ static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous(
|
||||
}
|
||||
|
||||
|
||||
/** Pack elements with shuffle instruction.
|
||||
* See the explanation in ColumnsHashing.h
|
||||
*/
|
||||
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
||||
template <typename T>
|
||||
static T ALWAYS_INLINE packFixedShuffle(
|
||||
const char * __restrict * __restrict srcs,
|
||||
size_t num_srcs,
|
||||
const size_t * __restrict elem_sizes,
|
||||
size_t idx,
|
||||
const uint8_t * __restrict masks)
|
||||
{
|
||||
__m128i res{};
|
||||
|
||||
for (size_t i = 0; i < num_srcs; ++i)
|
||||
{
|
||||
res = _mm_xor_si128(res,
|
||||
_mm_shuffle_epi8(
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[i] + elem_sizes[i] * idx)),
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&masks[i * sizeof(T)]))));
|
||||
}
|
||||
|
||||
T out;
|
||||
__builtin_memcpy(&out, &res, sizeof(T));
|
||||
return out;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -365,7 +365,13 @@ struct AggregationMethodKeysFixed
|
||||
template <typename Other>
|
||||
AggregationMethodKeysFixed(const Other & other) : data(other.data) {}
|
||||
|
||||
using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, Mapped, has_nullable_keys, has_low_cardinality, use_cache>;
|
||||
using State = ColumnsHashing::HashMethodKeysFixed<
|
||||
typename Data::value_type,
|
||||
Key,
|
||||
Mapped,
|
||||
has_nullable_keys,
|
||||
has_low_cardinality,
|
||||
use_cache>;
|
||||
|
||||
static const bool low_cardinality_optimization = false;
|
||||
|
||||
|
7
tests/performance/group_by_fixed_keys.xml
Normal file
7
tests/performance/group_by_fixed_keys.xml
Normal file
@ -0,0 +1,7 @@
|
||||
<!-- Performance test for GROUP BY over multiple fixed-size keys: each query groups numbers(100000000) by two to four integer keys derived from toUInt8(number). -->
<test>
|
||||
<query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
|
||||
<query>WITH toUInt8(number) AS k, toUInt16(k) AS k1, toUInt32(k) AS k2, k AS k3 SELECT k1, k2, k3, count() FROM numbers(100000000) GROUP BY k1, k2, k3</query>
|
||||
<query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
|
||||
<query>WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2, k + 2 AS k3, k + 3 AS k4 SELECT k1, k2, k3, k4, count() FROM numbers(100000000) GROUP BY k1, k2, k3, k4</query>
|
||||
<query>WITH toUInt8(number) AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2</query>
|
||||
</test>
|
Loading…
Reference in New Issue
Block a user