ClickHouse/src/Interpreters/SetVariants.h

286 lines
9.6 KiB
C++
Raw Normal View History

#pragma once
#include <Interpreters/AggregationCommon.h>
2019-02-04 14:36:15 +00:00
#include <Common/ColumnsHashing.h>
#include <Common/assert_cast.h>
#include <Common/Arena.h>
#include <Common/HashTable/HashSet.h>
2017-04-07 16:13:25 +00:00
#include <Common/HashTable/ClearableHashSet.h>
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................223550276.46 ResolutionWidth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24 Best: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84 ResolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41 Best: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
2019-02-28 09:35:38 +00:00
#include <Common/HashTable/FixedClearableHashSet.h>
#include <Common/HashTable/FixedHashSet.h>
2017-03-03 21:15:46 +00:00
namespace DB
{
2020-02-25 18:10:48 +00:00
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
2017-03-03 21:15:46 +00:00
/** Methods for different implementations of sets (used in right hand side of IN or for DISTINCT).
* To use as template parameter.
*/
2017-06-02 21:37:28 +00:00
/// For the case where there is one numeric key.
template <typename FieldType, typename TData, bool use_cache = true> /// UInt8/16/32/64 for any types with corresponding bit width.
struct SetMethodOneNumber
{
using Data = TData;
using Key = typename Data::key_type;
Data data;
using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type,
void, FieldType, use_cache>;
};
2017-06-02 21:37:28 +00:00
/// For the case where there is one string key.
template <typename TData>
struct SetMethodString
{
using Data = TData;
using Key = typename Data::key_type;
Data data;
using State = ColumnsHashing::HashMethodString<typename Data::value_type, void, true, false>;
};
2017-06-02 21:37:28 +00:00
/// For the case when there is one fixed-length string key.
template <typename TData>
struct SetMethodFixedString
{
using Data = TData;
using Key = typename Data::key_type;
Data data;
using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, void, true, false>;
};
namespace set_impl
{
/// This class is designed to provide the functionality that is required for
/// supporting nullable keys in SetMethodKeysFixed. If there are
/// no nullable keys, this class is merely implemented as an empty shell.
template <typename Key, bool has_nullable_keys>
class BaseStateKeysFixed;
/// Case where nullable keys are supported.
template <typename Key>
class BaseStateKeysFixed<Key, true>
{
protected:
void init(const ColumnRawPtrs & key_columns)
{
null_maps.reserve(key_columns.size());
actual_columns.reserve(key_columns.size());
for (const auto & col : key_columns)
{
2019-06-27 18:50:20 +00:00
if (auto * nullable = checkAndGetColumn<ColumnNullable>(*col))
{
2019-06-26 17:20:33 +00:00
actual_columns.push_back(&nullable->getNestedColumn());
null_maps.push_back(&nullable->getNullMapColumn());
}
else
{
actual_columns.push_back(col);
null_maps.push_back(nullptr);
}
}
}
/// Return the columns which actually contain the values of the keys.
/// For a given key column, if it is nullable, we return its nested
/// column. Otherwise we return the key column itself.
inline const ColumnRawPtrs & getActualColumns() const
{
return actual_columns;
}
/// Create a bitmap that indicates whether, for a particular row,
/// a key column bears a null value or not.
KeysNullMap<Key> createBitmap(size_t row) const
{
KeysNullMap<Key> bitmap{};
for (size_t k = 0; k < null_maps.size(); ++k)
{
if (null_maps[k] != nullptr)
{
const auto & null_map = assert_cast<const ColumnUInt8 &>(*null_maps[k]).getData();
if (null_map[row] == 1)
{
size_t bucket = k / 8;
size_t offset = k % 8;
bitmap[bucket] |= UInt8(1) << offset;
}
}
}
return bitmap;
}
private:
ColumnRawPtrs actual_columns;
ColumnRawPtrs null_maps;
};
/// Case where nullable keys are not supported.
template <typename Key>
class BaseStateKeysFixed<Key, false>
{
protected:
void init(const ColumnRawPtrs &)
{
throw Exception{"Internal error: calling init() for non-nullable"
" keys is forbidden", ErrorCodes::LOGICAL_ERROR};
}
const ColumnRawPtrs & getActualColumns() const
{
throw Exception{"Internal error: calling getActualColumns() for non-nullable"
" keys is forbidden", ErrorCodes::LOGICAL_ERROR};
}
2017-12-01 19:34:51 +00:00
KeysNullMap<Key> createBitmap(size_t) const
{
throw Exception{"Internal error: calling createBitmap() for non-nullable keys"
" is forbidden", ErrorCodes::LOGICAL_ERROR};
}
};
}
2017-06-02 21:37:28 +00:00
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
template <typename TData, bool has_nullable_keys_ = false>
struct SetMethodKeysFixed
{
using Data = TData;
using Key = typename Data::key_type;
static constexpr bool has_nullable_keys = has_nullable_keys_;
Data data;
2019-02-04 14:36:15 +00:00
using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, void, has_nullable_keys, false>;
};
2017-06-02 21:37:28 +00:00
/// For other cases. 128 bit hash from the key.
template <typename TData>
struct SetMethodHashed
{
using Data = TData;
using Key = typename Data::key_type;
Data data;
2019-02-04 14:36:15 +00:00
using State = ColumnsHashing::HashMethodHashed<typename Data::value_type, void>;
};
2017-06-02 21:37:28 +00:00
/** Different implementations of the set.
*/
2017-04-07 16:13:25 +00:00
struct NonClearableSet
{
/*
* As in Aggregator, using consecutive keys cache doesn't improve performance
* for FixedHashTables.
*/
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodOneNumber<UInt8, FixedHashSet<UInt8>, false /* use_cache */>> key8;
std::unique_ptr<SetMethodOneNumber<UInt16, FixedHashSet<UInt16>, false /* use_cache */>> key16;
2017-06-02 21:37:28 +00:00
/** Also for the experiment was tested the ability to use SmallSet,
* as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet).
* But this experiment showed that there is an advantage only in rare cases.
*/
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodOneNumber<UInt32, HashSet<UInt32, HashCRC32<UInt32>>>> key32;
std::unique_ptr<SetMethodOneNumber<UInt64, HashSet<UInt64, HashCRC32<UInt64>>>> key64;
std::unique_ptr<SetMethodString<HashSetWithSavedHash<StringRef>>> key_string;
std::unique_ptr<SetMethodFixedString<HashSetWithSavedHash<StringRef>>> key_fixed_string;
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>>> keys128;
2021-01-27 00:54:57 +00:00
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>>> keys256;
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodHashed<HashSet<UInt128, UInt128TrivialHash>>> hashed;
/// Support for nullable keys (for DISTINCT implementation).
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128;
2021-01-27 00:54:57 +00:00
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256;
2017-06-02 21:37:28 +00:00
/** Unlike Aggregator, `concat` method is not used here.
* This is done because `hashed` method, although slower, but in this case, uses less RAM.
* since when you use it, the key values themselves are not stored.
2017-04-07 16:13:25 +00:00
*/
};
struct ClearableSet
{
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodOneNumber<UInt8, FixedClearableHashSet<UInt8>, false /* use_cache */>> key8;
std::unique_ptr<SetMethodOneNumber<UInt16, FixedClearableHashSet<UInt16>, false /*use_cache */>> key16;
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodOneNumber<UInt32, ClearableHashSet<UInt32, HashCRC32<UInt32>>>> key32;
std::unique_ptr<SetMethodOneNumber<UInt64, ClearableHashSet<UInt64, HashCRC32<UInt64>>>> key64;
std::unique_ptr<SetMethodString<ClearableHashSetWithSavedHash<StringRef>>> key_string;
std::unique_ptr<SetMethodFixedString<ClearableHashSetWithSavedHash<StringRef>>> key_fixed_string;
std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>>> keys128;
2021-01-27 00:54:57 +00:00
std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>>> keys256;
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodHashed<ClearableHashSet<UInt128, UInt128TrivialHash>>> hashed;
2017-04-07 16:13:25 +00:00
/// Support for nullable keys (for DISTINCT implementation).
2020-04-14 22:25:31 +00:00
std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128;
2021-01-27 00:54:57 +00:00
std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256;
2017-06-02 21:37:28 +00:00
/** Unlike Aggregator, `concat` method is not used here.
* This is done because `hashed` method, although slower, but in this case, uses less RAM.
* since when you use it, the key values themselves are not stored.
*/
2017-04-07 16:13:25 +00:00
};
2017-04-07 16:13:25 +00:00
template <typename Variant>
struct SetVariantsTemplate: public Variant
{
Arena string_pool;
#define APPLY_FOR_SET_VARIANTS(M) \
2017-04-07 16:13:25 +00:00
M(key8) \
M(key16) \
M(key32) \
M(key64) \
M(key_string) \
M(key_fixed_string) \
M(keys128) \
M(keys256) \
M(nullable_keys128) \
M(nullable_keys256) \
M(hashed)
2017-04-07 16:13:25 +00:00
#define M(NAME) using Variant::NAME;
APPLY_FOR_SET_VARIANTS(M)
#undef M
enum class Type
{
EMPTY,
#define M(NAME) NAME,
APPLY_FOR_SET_VARIANTS(M)
#undef M
};
Type type = Type::EMPTY;
bool empty() const { return type == Type::EMPTY; }
static Type chooseMethod(const ColumnRawPtrs & key_columns, Sizes & key_sizes);
void init(Type type_);
size_t getTotalRowCount() const;
2017-06-02 21:37:28 +00:00
/// Counts the size in bytes of the Set buffer and the size of the `string_pool`
size_t getTotalByteCount() const;
};
2017-04-07 16:13:25 +00:00
using SetVariants = SetVariantsTemplate<NonClearableSet>;
using ClearableSetVariants = SetVariantsTemplate<ClearableSet>;
}