2019-01-21 10:39:24 +00:00
|
|
|
#pragma once
|
2019-01-21 10:39:53 +00:00
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
#include <Common/HashTable/HashTable.h>
|
|
|
|
#include <Common/HashTable/HashTableKeyHolder.h>
|
2019-01-21 10:39:53 +00:00
|
|
|
#include <Common/ColumnsHashingImpl.h>
|
2019-01-21 10:39:24 +00:00
|
|
|
#include <Common/Arena.h>
|
2022-04-30 11:53:59 +00:00
|
|
|
#include <Common/CacheBase.h>
|
2019-08-21 02:28:04 +00:00
|
|
|
#include <Common/assert_cast.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/unaligned.h>
|
2019-01-21 10:39:53 +00:00
|
|
|
|
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <Columns/ColumnFixedString.h>
|
|
|
|
#include <Columns/ColumnLowCardinality.h>
|
|
|
|
|
|
|
|
#include <Core/Defines.h>
|
|
|
|
#include <memory>
|
2021-05-02 22:42:01 +00:00
|
|
|
#include <cassert>
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:10:48 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int LOGICAL_ERROR;
|
|
|
|
}
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
namespace ColumnsHashing
|
|
|
|
{
|
|
|
|
|
2019-02-06 16:58:27 +00:00
|
|
|
/// For the case when there is one numeric key.
|
2019-02-01 08:23:38 +00:00
|
|
|
/// UInt8/16/32/64 for any type with corresponding bit width.
|
2021-02-04 14:46:36 +00:00
|
|
|
template <typename Value, typename Mapped, typename FieldType, bool use_cache = true, bool need_offset = false>
|
2019-02-01 08:23:38 +00:00
|
|
|
struct HashMethodOneNumber
|
2021-02-04 14:46:36 +00:00
|
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2021-02-04 14:46:36 +00:00
|
|
|
using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>;
|
|
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
|
2019-02-01 08:23:38 +00:00
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = true;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
const char * vec;
|
|
|
|
|
|
|
|
/// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise.
|
|
|
|
HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
|
|
|
|
{
|
2022-08-21 18:10:32 +00:00
|
|
|
vec = key_columns[0]->getRawData().data();
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
explicit HashMethodOneNumber(const IColumn * column)
|
2019-03-28 18:35:50 +00:00
|
|
|
{
|
2022-08-21 18:10:32 +00:00
|
|
|
vec = column->getRawData().data();
|
2019-03-28 18:35:50 +00:00
|
|
|
}
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
/// Creates context. Method is called once and result context is used in all threads.
|
2019-02-01 08:23:38 +00:00
|
|
|
using Base::createContext; /// (const HashMethodContext::Settings &) -> HashMethodContextPtr
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
/// Emplace key into HashTable or HashMap. If Data is HashMap, returns ptr to value, otherwise nullptr.
|
2019-02-01 08:23:38 +00:00
|
|
|
/// Data is a HashTable where to insert key from column's row.
|
|
|
|
/// For Serialized method, key may be placed in pool.
|
|
|
|
using Base::emplaceKey; /// (Data & data, size_t row, Arena & pool) -> EmplaceResult
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
/// Find key into HashTable or HashMap. If Data is HashMap and key was found, returns ptr to value, otherwise nullptr.
|
2019-02-01 08:23:38 +00:00
|
|
|
using Base::findKey; /// (Data & data, size_t row, Arena & pool) -> FindResult
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
/// Get hash value of row.
|
2019-02-01 08:23:38 +00:00
|
|
|
using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t
|
|
|
|
|
|
|
|
/// Is used for default implementation in HashMethodBase.
|
2022-08-11 02:46:06 +00:00
|
|
|
FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad<FieldType>(vec + row * sizeof(FieldType)); }
|
2020-07-29 18:35:52 +00:00
|
|
|
|
|
|
|
const FieldType * getKeyData() const { return reinterpret_cast<const FieldType *>(vec); }
|
2019-01-21 10:39:24 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2019-02-06 16:58:27 +00:00
|
|
|
/// For the case when there is one string key.
|
2021-02-04 14:46:36 +00:00
|
|
|
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
|
2019-02-01 08:23:38 +00:00
|
|
|
struct HashMethodString
|
2021-02-04 14:46:36 +00:00
|
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2021-02-04 14:46:36 +00:00
|
|
|
using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
|
|
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
|
2019-02-01 08:23:38 +00:00
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = false;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
const IColumn::Offset * offsets;
|
|
|
|
const UInt8 * chars;
|
|
|
|
|
|
|
|
HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
|
|
|
|
{
|
2022-08-11 02:46:06 +00:00
|
|
|
const IColumn & column = *key_columns[0];
|
|
|
|
const ColumnString & column_string = assert_cast<const ColumnString &>(column);
|
2019-01-21 10:39:24 +00:00
|
|
|
offsets = column_string.getOffsets().data();
|
|
|
|
chars = column_string.getChars().data();
|
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2022-08-11 02:46:06 +00:00
|
|
|
StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
if constexpr (place_string_to_arena)
|
|
|
|
{
|
|
|
|
return ArenaKeyHolder{key, pool};
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
return key;
|
|
|
|
}
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
2019-02-01 08:23:38 +00:00
|
|
|
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
2019-01-21 10:39:24 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2019-02-06 16:58:27 +00:00
|
|
|
/// For the case when there is one fixed-length string key.
|
2021-02-04 14:46:36 +00:00
|
|
|
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
|
2019-02-01 08:23:38 +00:00
|
|
|
struct HashMethodFixedString
|
2021-02-04 14:46:36 +00:00
|
|
|
: public columns_hashing_impl::
|
|
|
|
HashMethodBase<HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2021-02-04 14:46:36 +00:00
|
|
|
using Self = HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
|
|
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
|
2019-02-01 08:23:38 +00:00
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = false;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
size_t n;
|
|
|
|
const ColumnFixedString::Chars * chars;
|
|
|
|
|
|
|
|
HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
|
|
|
|
{
|
|
|
|
const IColumn & column = *key_columns[0];
|
2019-08-21 02:28:04 +00:00
|
|
|
const ColumnFixedString & column_string = assert_cast<const ColumnFixedString &>(column);
|
2019-01-21 10:39:24 +00:00
|
|
|
n = column_string.getN();
|
|
|
|
chars = &column_string.getChars();
|
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const
|
|
|
|
{
|
2022-08-11 02:46:06 +00:00
|
|
|
StringRef key(&(*chars)[row * n], n);
|
2019-07-31 15:44:03 +00:00
|
|
|
|
|
|
|
if constexpr (place_string_to_arena)
|
|
|
|
{
|
|
|
|
return ArenaKeyHolder{key, pool};
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
return key;
|
|
|
|
}
|
|
|
|
}
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
protected:
|
2019-02-01 08:23:38 +00:00
|
|
|
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
2019-01-21 10:39:24 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/// Cache stores dictionaries and saved_hash per dictionary key.
|
|
|
|
class LowCardinalityDictionaryCache : public HashMethodContext
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
/// Will assume that dictionaries with same hash has the same keys.
|
|
|
|
/// Just in case, check that they have also the same size.
|
|
|
|
struct DictionaryKey
|
|
|
|
{
|
|
|
|
UInt128 hash;
|
|
|
|
UInt64 size;
|
|
|
|
|
|
|
|
bool operator== (const DictionaryKey & other) const { return hash == other.hash && size == other.size; }
|
|
|
|
};
|
|
|
|
|
|
|
|
struct DictionaryKeyHash
|
|
|
|
{
|
|
|
|
size_t operator()(const DictionaryKey & key) const
|
|
|
|
{
|
|
|
|
SipHash hash;
|
2021-01-27 00:54:57 +00:00
|
|
|
hash.update(key.hash);
|
2019-01-21 10:39:24 +00:00
|
|
|
hash.update(key.size);
|
|
|
|
return hash.get64();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct CachedValues
|
|
|
|
{
|
|
|
|
/// Store ptr to dictionary to be sure it won't be deleted.
|
|
|
|
ColumnPtr dictionary_holder;
|
|
|
|
/// Hashes for dictionary keys.
|
|
|
|
const UInt64 * saved_hash = nullptr;
|
|
|
|
};
|
|
|
|
|
|
|
|
using CachedValuesPtr = std::shared_ptr<CachedValues>;
|
|
|
|
|
|
|
|
explicit LowCardinalityDictionaryCache(const HashMethodContext::Settings & settings) : cache(settings.max_threads) {}
|
|
|
|
|
|
|
|
CachedValuesPtr get(const DictionaryKey & key) { return cache.get(key); }
|
|
|
|
void set(const DictionaryKey & key, const CachedValuesPtr & mapped) { cache.set(key, mapped); }
|
|
|
|
|
|
|
|
private:
|
2022-04-30 11:53:59 +00:00
|
|
|
using Cache = CacheBase<DictionaryKey, CachedValues, DictionaryKeyHash>;
|
2019-01-21 10:39:24 +00:00
|
|
|
Cache cache;
|
|
|
|
};
|
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
/// Single low cardinality column.
|
2019-01-21 10:39:53 +00:00
|
|
|
template <typename SingleColumnMethod, typename Mapped, bool use_cache>
|
2019-01-21 10:39:24 +00:00
|
|
|
struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod
|
|
|
|
{
|
|
|
|
using Base = SingleColumnMethod;
|
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
enum class VisitValue
|
|
|
|
{
|
|
|
|
Empty = 0,
|
|
|
|
Found = 1,
|
|
|
|
NotFound = 2,
|
|
|
|
};
|
|
|
|
|
2022-03-11 13:34:58 +00:00
|
|
|
static constexpr bool has_mapped = !std::is_same_v<Mapped, void>;
|
2019-01-21 10:39:53 +00:00
|
|
|
using EmplaceResult = columns_hashing_impl::EmplaceResultImpl<Mapped>;
|
|
|
|
using FindResult = columns_hashing_impl::FindResultImpl<Mapped>;
|
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = Base::has_cheap_key_calculation;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
static HashMethodContextPtr createContext(const HashMethodContext::Settings & settings)
|
|
|
|
{
|
|
|
|
return std::make_shared<LowCardinalityDictionaryCache>(settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnRawPtrs key_columns;
|
|
|
|
const IColumn * positions = nullptr;
|
|
|
|
size_t size_of_index_type = 0;
|
|
|
|
|
|
|
|
/// saved hash is from current column or from cache.
|
|
|
|
const UInt64 * saved_hash = nullptr;
|
|
|
|
/// Hold dictionary in case saved_hash is from cache to be sure it won't be deleted.
|
|
|
|
ColumnPtr dictionary_holder;
|
|
|
|
|
|
|
|
/// Cache AggregateDataPtr for current column in order to decrease the number of hash table usages.
|
2019-01-21 10:39:53 +00:00
|
|
|
columns_hashing_impl::MappedCache<Mapped> mapped_cache;
|
|
|
|
PaddedPODArray<VisitValue> visit_cache;
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
/// If initialized column is nullable.
|
|
|
|
bool is_nullable = false;
|
|
|
|
|
2020-03-19 23:48:53 +00:00
|
|
|
static const ColumnLowCardinality & getLowCardinalityColumn(const IColumn * column)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * low_cardinality_column = typeid_cast<const ColumnLowCardinality *>(column);
|
2020-03-19 23:48:53 +00:00
|
|
|
if (!low_cardinality_column)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid aggregation key type for HashMethodSingleLowCardinalityColumn method. "
|
|
|
|
"Excepted LowCardinality, got {}", column->getName());
|
2020-03-19 23:48:53 +00:00
|
|
|
return *low_cardinality_column;
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
HashMethodSingleLowCardinalityColumn(
|
|
|
|
const ColumnRawPtrs & key_columns_low_cardinality, const Sizes & key_sizes, const HashMethodContextPtr & context)
|
|
|
|
: Base({getLowCardinalityColumn(key_columns_low_cardinality[0]).getDictionary().getNestedNotNullableColumn().get()}, key_sizes, context)
|
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * column = &getLowCardinalityColumn(key_columns_low_cardinality[0]);
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
if (!context)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache wasn't created for HashMethodSingleLowCardinalityColumn");
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
LowCardinalityDictionaryCache * lcd_cache;
|
2019-01-21 10:39:24 +00:00
|
|
|
if constexpr (use_cache)
|
|
|
|
{
|
2019-08-03 11:02:40 +00:00
|
|
|
lcd_cache = typeid_cast<LowCardinalityDictionaryCache *>(context.get());
|
|
|
|
if (!lcd_cache)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
const auto & cached_val = *context;
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid type for HashMethodSingleLowCardinalityColumn cache: {}",
|
|
|
|
demangle(typeid(cached_val).name()));
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * dict = column->getDictionary().getNestedNotNullableColumn().get();
|
2019-01-21 10:39:24 +00:00
|
|
|
is_nullable = column->getDictionary().nestedColumnIsNullable();
|
|
|
|
key_columns = {dict};
|
|
|
|
bool is_shared_dict = column->isSharedDictionary();
|
|
|
|
|
|
|
|
typename LowCardinalityDictionaryCache::DictionaryKey dictionary_key;
|
|
|
|
typename LowCardinalityDictionaryCache::CachedValuesPtr cached_values;
|
|
|
|
|
|
|
|
if (is_shared_dict)
|
|
|
|
{
|
|
|
|
dictionary_key = {column->getDictionary().getHash(), dict->size()};
|
|
|
|
if constexpr (use_cache)
|
2019-08-03 11:02:40 +00:00
|
|
|
cached_values = lcd_cache->get(dictionary_key);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cached_values)
|
|
|
|
{
|
|
|
|
saved_hash = cached_values->saved_hash;
|
|
|
|
dictionary_holder = cached_values->dictionary_holder;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
saved_hash = column->getDictionary().tryGetSavedHash();
|
|
|
|
dictionary_holder = column->getDictionaryPtr();
|
|
|
|
|
|
|
|
if constexpr (use_cache)
|
|
|
|
{
|
|
|
|
if (is_shared_dict)
|
|
|
|
{
|
|
|
|
cached_values = std::make_shared<typename LowCardinalityDictionaryCache::CachedValues>();
|
|
|
|
cached_values->saved_hash = saved_hash;
|
|
|
|
cached_values->dictionary_holder = dictionary_holder;
|
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
lcd_cache->set(dictionary_key, cached_values);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
if constexpr (has_mapped)
|
|
|
|
mapped_cache.resize(key_columns[0]->size());
|
|
|
|
|
|
|
|
VisitValue empty(VisitValue::Empty);
|
|
|
|
visit_cache.assign(key_columns[0]->size(), empty);
|
2019-01-21 10:39:24 +00:00
|
|
|
|
|
|
|
size_of_index_type = column->getSizeOfIndexType();
|
|
|
|
positions = column->getIndexesPtr().get();
|
|
|
|
}
|
|
|
|
|
|
|
|
ALWAYS_INLINE size_t getIndexAt(size_t row) const
|
|
|
|
{
|
|
|
|
switch (size_of_index_type)
|
|
|
|
{
|
2019-08-21 02:28:04 +00:00
|
|
|
case sizeof(UInt8): return assert_cast<const ColumnUInt8 *>(positions)->getElement(row);
|
|
|
|
case sizeof(UInt16): return assert_cast<const ColumnUInt16 *>(positions)->getElement(row);
|
|
|
|
case sizeof(UInt32): return assert_cast<const ColumnUInt32 *>(positions)->getElement(row);
|
|
|
|
case sizeof(UInt64): return assert_cast<const ColumnUInt64 *>(positions)->getElement(row);
|
2023-01-23 21:13:58 +00:00
|
|
|
default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size of index type for low cardinality column.");
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
/// Get the key holder from the key columns for insertion into the hash table.
|
|
|
|
ALWAYS_INLINE auto getKeyHolder(size_t row, Arena & pool) const
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-07-31 15:44:03 +00:00
|
|
|
return Base::getKeyHolder(getIndexAt(row), pool);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Data>
|
2019-01-21 10:39:53 +00:00
|
|
|
ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row_, Arena & pool)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
size_t row = getIndexAt(row_);
|
|
|
|
|
|
|
|
if (is_nullable && row == 0)
|
|
|
|
{
|
2019-01-21 10:39:53 +00:00
|
|
|
visit_cache[row] = VisitValue::Found;
|
2019-01-24 14:56:04 +00:00
|
|
|
bool has_null_key = data.hasNullKeyData();
|
|
|
|
data.hasNullKeyData() = true;
|
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
if constexpr (has_mapped)
|
2019-01-24 14:56:04 +00:00
|
|
|
return EmplaceResult(data.getNullKeyData(), mapped_cache[0], !has_null_key);
|
2019-01-21 10:39:53 +00:00
|
|
|
else
|
2019-01-24 14:56:04 +00:00
|
|
|
return EmplaceResult(!has_null_key);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
if (visit_cache[row] == VisitValue::Found)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-01-21 10:39:53 +00:00
|
|
|
if constexpr (has_mapped)
|
|
|
|
return EmplaceResult(mapped_cache[row], mapped_cache[row], false);
|
|
|
|
else
|
|
|
|
return EmplaceResult(false);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
auto key_holder = getKeyHolder(row_, pool);
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
bool inserted = false;
|
2019-08-20 09:58:44 +00:00
|
|
|
typename Data::LookupResult it;
|
2019-01-21 10:39:24 +00:00
|
|
|
if (saved_hash)
|
2019-07-31 15:44:03 +00:00
|
|
|
data.emplace(key_holder, it, inserted, saved_hash[row]);
|
2019-01-21 10:39:24 +00:00
|
|
|
else
|
2019-07-31 15:44:03 +00:00
|
|
|
data.emplace(key_holder, it, inserted);
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
visit_cache[row] = VisitValue::Found;
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
if constexpr (has_mapped)
|
2019-02-01 08:23:38 +00:00
|
|
|
{
|
2019-10-29 15:16:51 +00:00
|
|
|
auto & mapped = it->getMapped();
|
2019-07-31 15:44:03 +00:00
|
|
|
if (inserted)
|
2019-02-05 09:43:14 +00:00
|
|
|
{
|
2019-08-20 09:58:44 +00:00
|
|
|
new (&mapped) Mapped();
|
2019-02-05 09:43:14 +00:00
|
|
|
}
|
2019-08-20 09:58:44 +00:00
|
|
|
mapped_cache[row] = mapped;
|
|
|
|
return EmplaceResult(mapped, mapped_cache[row], inserted);
|
2019-02-12 14:38:29 +00:00
|
|
|
}
|
2019-01-21 10:39:53 +00:00
|
|
|
else
|
|
|
|
return EmplaceResult(inserted);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ALWAYS_INLINE bool isNullAt(size_t i)
|
|
|
|
{
|
|
|
|
if (!is_nullable)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return getIndexAt(i) == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Data>
|
2022-02-10 15:38:09 +00:00
|
|
|
ALWAYS_INLINE FindResult findKey(Data & data, size_t row_, Arena & pool)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
size_t row = getIndexAt(row_);
|
|
|
|
|
|
|
|
if (is_nullable && row == 0)
|
2019-01-21 10:39:53 +00:00
|
|
|
{
|
|
|
|
if constexpr (has_mapped)
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(data.hasNullKeyData() ? &data.getNullKeyData() : nullptr, data.hasNullKeyData(), 0);
|
2019-01-21 10:39:53 +00:00
|
|
|
else
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(data.hasNullKeyData(), 0);
|
2019-01-21 10:39:53 +00:00
|
|
|
}
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
if (visit_cache[row] != VisitValue::Empty)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-01-21 10:39:53 +00:00
|
|
|
if constexpr (has_mapped)
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(&mapped_cache[row], visit_cache[row] == VisitValue::Found, 0);
|
2019-01-21 10:39:53 +00:00
|
|
|
else
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(visit_cache[row] == VisitValue::Found, 0);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
auto key_holder = getKeyHolder(row_, pool);
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2022-02-10 15:38:09 +00:00
|
|
|
typename Data::LookupResult it;
|
2019-01-21 10:39:24 +00:00
|
|
|
if (saved_hash)
|
2022-02-10 15:38:09 +00:00
|
|
|
it = data.find(keyHolderGetKey(key_holder), saved_hash[row]);
|
2019-01-21 10:39:24 +00:00
|
|
|
else
|
2022-02-10 15:38:09 +00:00
|
|
|
it = data.find(keyHolderGetKey(key_holder));
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2022-02-10 15:38:09 +00:00
|
|
|
bool found = it;
|
2019-01-21 10:39:53 +00:00
|
|
|
visit_cache[row] = found ? VisitValue::Found : VisitValue::NotFound;
|
|
|
|
|
|
|
|
if constexpr (has_mapped)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
if (found)
|
2022-02-10 15:38:09 +00:00
|
|
|
mapped_cache[row] = it->getMapped();
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
2022-02-13 13:43:02 +00:00
|
|
|
|
2022-02-10 15:38:09 +00:00
|
|
|
size_t offset = 0;
|
|
|
|
|
|
|
|
if constexpr (FindResult::has_offset)
|
|
|
|
offset = found ? data.offsetInternal(it) : 0;
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-01-21 10:39:53 +00:00
|
|
|
if constexpr (has_mapped)
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(&mapped_cache[row], found, offset);
|
2019-01-21 10:39:53 +00:00
|
|
|
else
|
2022-02-10 15:38:09 +00:00
|
|
|
return FindResult(found, offset);
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Data>
|
|
|
|
ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool)
|
|
|
|
{
|
|
|
|
row = getIndexAt(row);
|
|
|
|
if (saved_hash)
|
|
|
|
return saved_hash[row];
|
|
|
|
|
|
|
|
return Base::getHash(data, row, pool);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Optional mask for low cardinality columns.
|
|
|
|
template <bool has_low_cardinality>
|
|
|
|
struct LowCardinalityKeys
|
|
|
|
{
|
|
|
|
ColumnRawPtrs nested_columns;
|
|
|
|
ColumnRawPtrs positions;
|
|
|
|
Sizes position_sizes;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <>
|
|
|
|
struct LowCardinalityKeys<false> {};
|
|
|
|
|
2019-02-06 16:58:27 +00:00
|
|
|
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
|
2021-02-13 22:56:04 +00:00
|
|
|
template <
|
|
|
|
typename Value,
|
|
|
|
typename Key,
|
|
|
|
typename Mapped,
|
|
|
|
bool has_nullable_keys_ = false,
|
|
|
|
bool has_low_cardinality_ = false,
|
|
|
|
bool use_cache = true,
|
|
|
|
bool need_offset = false>
|
2019-01-21 10:39:53 +00:00
|
|
|
struct HashMethodKeysFixed
|
|
|
|
: private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
|
2021-02-04 14:46:36 +00:00
|
|
|
, public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2021-02-04 14:46:36 +00:00
|
|
|
using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>;
|
|
|
|
using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
|
2019-02-01 08:23:38 +00:00
|
|
|
using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
static constexpr bool has_nullable_keys = has_nullable_keys_;
|
|
|
|
static constexpr bool has_low_cardinality = has_low_cardinality_;
|
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = true;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
LowCardinalityKeys<has_low_cardinality> low_cardinality_keys;
|
|
|
|
Sizes key_sizes;
|
|
|
|
size_t keys_size;
|
|
|
|
|
2021-02-13 22:56:04 +00:00
|
|
|
/// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
|
|
|
|
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
|
|
|
std::unique_ptr<uint8_t[]> masks;
|
|
|
|
std::unique_ptr<const char*[]> columns_data;
|
|
|
|
#endif
|
|
|
|
|
2021-02-25 15:51:01 +00:00
|
|
|
PaddedPODArray<Key> prepared_keys;
|
|
|
|
|
2021-03-10 11:00:24 +00:00
|
|
|
static bool usePreparedKeys(const Sizes & key_sizes)
|
|
|
|
{
|
|
|
|
if (has_low_cardinality || has_nullable_keys || sizeof(Key) > 16)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (auto size : key_sizes)
|
|
|
|
if (size != 1 && size != 2 && size != 4 && size != 8 && size != 16)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
|
2022-03-11 21:47:28 +00:00
|
|
|
: Base(key_columns), key_sizes(key_sizes_), keys_size(key_columns.size())
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
if constexpr (has_low_cardinality)
|
|
|
|
{
|
|
|
|
low_cardinality_keys.nested_columns.resize(key_columns.size());
|
|
|
|
low_cardinality_keys.positions.assign(key_columns.size(), nullptr);
|
|
|
|
low_cardinality_keys.position_sizes.resize(key_columns.size());
|
|
|
|
for (size_t i = 0; i < key_columns.size(); ++i)
|
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
if (const auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(key_columns[i]))
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
low_cardinality_keys.nested_columns[i] = low_cardinality_col->getDictionary().getNestedColumn().get();
|
|
|
|
low_cardinality_keys.positions[i] = &low_cardinality_col->getIndexes();
|
|
|
|
low_cardinality_keys.position_sizes[i] = low_cardinality_col->getSizeOfIndexType();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
low_cardinality_keys.nested_columns[i] = key_columns[i];
|
|
|
|
}
|
|
|
|
}
|
2021-02-13 22:56:04 +00:00
|
|
|
|
2021-03-10 11:00:24 +00:00
|
|
|
if (usePreparedKeys(key_sizes))
|
2021-02-26 10:17:00 +00:00
|
|
|
{
|
2021-03-10 11:00:24 +00:00
|
|
|
packFixedBatch(keys_size, Base::getActualColumns(), key_sizes, prepared_keys);
|
2021-02-26 10:17:00 +00:00
|
|
|
}
|
2021-02-25 15:51:01 +00:00
|
|
|
|
2021-02-13 22:56:04 +00:00
|
|
|
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
2021-03-10 11:00:24 +00:00
|
|
|
else if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
|
2021-02-13 22:56:04 +00:00
|
|
|
{
|
|
|
|
/** The task is to "pack" multiple fixed-size fields into single larger Key.
|
|
|
|
* Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key:
|
|
|
|
* [- ---- -- -------- -] - the resulting uint128 key
|
|
|
|
* ^ ^ ^ ^ ^
|
|
|
|
* u8 u32 u16 u64 zero
|
|
|
|
*
|
|
|
|
* We can do it with the help of SSSE3 shuffle instruction.
|
|
|
|
*
|
|
|
|
* There will be a mask for every GROUP BY element (keys_size masks in total).
|
|
|
|
* Every mask has 16 bytes but only sizeof(Key) bytes are used (other we don't care).
|
|
|
|
*
|
|
|
|
* Every byte in the mask has the following meaning:
|
|
|
|
* - if it is 0..15, take the element at this index from source register and place here in the result;
|
|
|
|
* - if it is 0xFF - set the elemend in the result to zero.
|
|
|
|
*
|
|
|
|
* Example:
|
|
|
|
* We want to copy UInt32 to offset 1 in the destination and set other bytes in the destination as zero.
|
|
|
|
* The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
|
|
|
|
*
|
|
|
|
* The max size of destination is 16 bytes, because we cannot process more with SSSE3.
|
|
|
|
*
|
|
|
|
* The method is disabled under MSan, because it's allowed
|
|
|
|
* to load into SSE register and process up to 15 bytes of uninitialized memory in columns padding.
|
|
|
|
* We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
|
|
|
|
*
|
|
|
|
* 16-bytes masks can be placed overlapping, only first sizeof(Key) bytes are relevant in each mask.
|
|
|
|
* We initialize them to 0xFF and then set the needed elements.
|
|
|
|
*/
|
|
|
|
size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
|
|
|
|
masks.reset(new uint8_t[total_masks_size]);
|
|
|
|
memset(masks.get(), 0xFF, total_masks_size);
|
|
|
|
|
|
|
|
size_t offset = 0;
|
|
|
|
for (size_t i = 0; i < keys_size; ++i)
|
|
|
|
{
|
|
|
|
for (size_t j = 0; j < key_sizes[i]; ++j)
|
|
|
|
{
|
|
|
|
masks[i * sizeof(Key) + offset] = j;
|
|
|
|
++offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
columns_data.reset(new const char*[keys_size]);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < keys_size; ++i)
|
2022-08-21 18:10:32 +00:00
|
|
|
columns_data[i] = Base::getActualColumns()[i]->getRawData().data();
|
2021-02-13 22:56:04 +00:00
|
|
|
}
|
|
|
|
#endif
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-02-01 10:14:17 +00:00
|
|
|
if constexpr (has_nullable_keys)
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
|
|
|
auto bitmap = Base::createBitmap(row);
|
|
|
|
return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes, bitmap);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if constexpr (has_low_cardinality)
|
|
|
|
return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
|
|
|
|
&low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);
|
|
|
|
|
2021-02-25 15:51:01 +00:00
|
|
|
if (!prepared_keys.empty())
|
|
|
|
return prepared_keys[row];
|
|
|
|
|
2021-02-13 22:56:04 +00:00
|
|
|
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
|
2021-05-02 22:42:01 +00:00
|
|
|
if constexpr (sizeof(Key) <= 16)
|
|
|
|
{
|
|
|
|
assert(!has_low_cardinality && !has_nullable_keys);
|
2021-02-13 22:56:04 +00:00
|
|
|
return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
|
2021-05-02 22:42:01 +00:00
|
|
|
}
|
2021-02-13 22:56:04 +00:00
|
|
|
#endif
|
2019-01-21 10:39:24 +00:00
|
|
|
return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
|
|
|
|
}
|
|
|
|
}
|
2021-03-10 11:00:24 +00:00
|
|
|
|
|
|
|
static std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
|
|
|
|
{
|
|
|
|
if (!usePreparedKeys(key_sizes))
|
|
|
|
return {};
|
|
|
|
|
|
|
|
std::vector<IColumn *> new_columns;
|
|
|
|
new_columns.reserve(key_columns.size());
|
|
|
|
|
|
|
|
Sizes new_sizes;
|
|
|
|
auto fill_size = [&](size_t size)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < key_sizes.size(); ++i)
|
|
|
|
{
|
|
|
|
if (key_sizes[i] == size)
|
|
|
|
{
|
|
|
|
new_columns.push_back(key_columns[i]);
|
|
|
|
new_sizes.push_back(size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
fill_size(16);
|
|
|
|
fill_size(8);
|
|
|
|
fill_size(4);
|
|
|
|
fill_size(2);
|
|
|
|
fill_size(1);
|
|
|
|
|
|
|
|
key_columns.swap(new_columns);
|
|
|
|
return new_sizes;
|
|
|
|
}
|
2019-01-21 10:39:24 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/** Hash by concatenating serialized key values.
|
|
|
|
* The serialized value differs in that it uniquely allows to deserialize it, having only the position with which it starts.
|
|
|
|
* That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
|
|
|
|
* Therefore, when aggregating by several strings, there is no ambiguity.
|
|
|
|
*/
|
2019-02-01 08:23:38 +00:00
|
|
|
template <typename Value, typename Mapped>
|
|
|
|
struct HashMethodSerialized
|
|
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value, Mapped, false>
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-02-01 08:23:38 +00:00
|
|
|
using Self = HashMethodSerialized<Value, Mapped>;
|
|
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
|
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = false;
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
ColumnRawPtrs key_columns;
|
|
|
|
size_t keys_size;
|
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
HashMethodSerialized(const ColumnRawPtrs & key_columns_, const Sizes & /*key_sizes*/, const HashMethodContextPtr &)
|
|
|
|
: key_columns(key_columns_), keys_size(key_columns_.size()) {}
|
2019-01-21 10:39:24 +00:00
|
|
|
|
2019-02-01 08:23:38 +00:00
|
|
|
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
|
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena & pool) const
|
2019-01-21 10:39:24 +00:00
|
|
|
{
|
2019-07-31 15:44:03 +00:00
|
|
|
return SerializedKeyHolder{
|
|
|
|
serializeKeysToPoolContiguous(row, keys_size, key_columns, pool),
|
|
|
|
pool};
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-02-06 16:58:27 +00:00
|
|
|
/// For the case when the key is a 128-bit hash calculated from the values of all key columns.
|
2021-02-04 14:46:36 +00:00
|
|
|
template <typename Value, typename Mapped, bool use_cache = true, bool need_offset = false>
|
2019-02-01 08:23:38 +00:00
|
|
|
struct HashMethodHashed
|
2021-02-04 14:46:36 +00:00
|
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
|
2019-01-24 14:56:04 +00:00
|
|
|
{
|
|
|
|
using Key = UInt128;
|
2021-02-04 14:46:36 +00:00
|
|
|
using Self = HashMethodHashed<Value, Mapped, use_cache, need_offset>;
|
|
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
|
2019-01-24 14:56:04 +00:00
|
|
|
|
2022-09-21 16:59:07 +00:00
|
|
|
static constexpr bool has_cheap_key_calculation = false;
|
|
|
|
|
2019-01-24 14:56:04 +00:00
|
|
|
ColumnRawPtrs key_columns;
|
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const HashMethodContextPtr &)
|
|
|
|
: key_columns(std::move(key_columns_)) {}
|
2019-01-24 14:56:04 +00:00
|
|
|
|
2019-07-31 15:44:03 +00:00
|
|
|
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
|
|
|
|
{
|
|
|
|
return hash128(row, key_columns.size(), key_columns);
|
|
|
|
}
|
2019-01-24 14:56:04 +00:00
|
|
|
};
|
|
|
|
|
2019-01-21 10:39:24 +00:00
|
|
|
}
|
|
|
|
}
|