ClickHouse/src/Common/ColumnsHashingImpl.h

395 lines
11 KiB
C++
Raw Normal View History

2019-01-21 10:39:53 +00:00
#pragma once
#include <Columns/IColumn.h>
#include <Columns/ColumnNullable.h>
#include <Common/assert_cast.h>
#include <Common/HashTable/HashTableKeyHolder.h>
2019-01-21 10:39:53 +00:00
#include <Interpreters/AggregationCommon.h>
2019-01-21 10:39:53 +00:00
namespace DB
{
2020-02-25 18:10:48 +00:00
/// Error codes referenced in this header. The constant is only declared here
/// (`extern`), so its definition is expected to be provided elsewhere.
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
2019-01-21 10:39:53 +00:00
namespace ColumnsHashing
{
2019-02-01 08:23:38 +00:00
/// Generic context for a HashMethod; it is used for caching.
/// One context object is shared between multiple threads, therefore every
/// method of a context must be thread-safe.
class HashMethodContext
{
public:
    /// Parameters taken by the `createContext` factory of a hash method.
    struct Settings
    {
        size_t max_threads;
    };

    /// Virtual, so that a derived context can be destroyed through a base pointer.
    virtual ~HashMethodContext() = default;
};

/// Shared-ownership handle to a context.
using HashMethodContextPtr = std::shared_ptr<HashMethodContext>;
2019-01-21 10:39:53 +00:00
namespace columns_hashing_impl
{
/// Remembers the most recently processed element so that a repeated key can be
/// answered from the cache instead of the hash table.
/// `Value` is either the key itself or a pair-like object whose `first` member is the key.
template <typename Value, bool consecutive_keys_optimization_>
struct LastElementCache
{
    static constexpr bool consecutive_keys_optimization = consecutive_keys_optimization_;

    /// The cached element. Not initialized here; the owner is expected to initialize it.
    Value value;
    /// True until an element has been stored in the cache.
    bool empty = true;
    /// Whether the cached key was present in the hash table when it was cached.
    bool found = false;

    /// Returns true if the cache is filled and the whole cached value equals `value_`.
    bool check(const Value & value_) const { return !empty && value == value_; }

    /// Returns true if the cache is filled and the key part (`value.first`) equals `key`.
    /// Only usable when `Value` is pair-like.
    template <typename Key>
    bool check(const Key & key) const { return !empty && value.first == key; }
};

/// Specialization for the disabled optimization: carries no state at all.
template <typename Data>
struct LastElementCache<Data, false>
{
    static constexpr bool consecutive_keys_optimization = false;
};
/// Result of an emplace operation for hash tables that store a mapped value.
/// Keeps two references: `value` is what getMapped() returns, and `cached_value`
/// is a second location that setMapped() keeps in sync with it.
template <typename Mapped>
class EmplaceResultImpl
{
    Mapped & value;
    Mapped & cached_value;
    bool inserted;

public:
    EmplaceResultImpl(Mapped & value_, Mapped & cached_value_, bool inserted_)
        : value(value_), cached_value(cached_value_), inserted(inserted_) {}

    /// True if the key was newly inserted by the emplace call.
    bool isInserted() const { return inserted; }

    /// Reference to the mapped value.
    auto & getMapped() const { return value; }

    /// Writes `mapped` to both the cached location and the mapped value.
    void setMapped(const Mapped & mapped)
    {
        cached_value = mapped;
        value = mapped;
    }
};

/// Result of an emplace operation for hash tables without a mapped value (sets):
/// only the "inserted" flag is available.
template <>
class EmplaceResultImpl<void>
{
    bool inserted;

public:
    explicit EmplaceResultImpl(bool inserted_) : inserted(inserted_) {}

    /// True if the key was newly inserted by the emplace call.
    bool isInserted() const { return inserted; }
};
/// FindResult may optionally contain a pointer to the value and an offset in the hash table buffer.
/// Only the bool `found` is required.
/// So there are 4 different specializations of FindResultImpl.
/// Common part of every find result: stores only whether the key was found.
class FindResultImplBase
{
    bool found;

public:
    explicit FindResultImplBase(bool found_) : found(found_) {}

    /// True if the looked-up key was found.
    bool isFound() const { return found; }
};
template <bool need_offset = false>
class FindResultImplOffsetBase
{
public:
constexpr static bool has_offset = need_offset;
explicit FindResultImplOffsetBase(size_t /* off */) {}
2019-01-21 10:39:53 +00:00
};
template <>
class FindResultImplOffsetBase<true>
2019-01-21 10:39:53 +00:00
{
size_t offset;
public:
constexpr static bool has_offset = true;
explicit FindResultImplOffsetBase(size_t off) : offset(off) {}
ALWAYS_INLINE size_t getOffset() const { return offset; }
};
/// Find result for hash tables that store a mapped value: the "found" flag,
/// the optional offset, and a pointer to the mapped value.
template <typename Mapped, bool need_offset = false>
class FindResultImpl : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
{
    /// Null for a default-constructed result; previously left uninitialized.
    Mapped * value = nullptr;

public:
    /// Constructs a "not found" result with offset 0 and no value.
    FindResultImpl()
        : FindResultImplBase(false), FindResultImplOffsetBase<need_offset>(0)
    {}

    FindResultImpl(Mapped * value_, bool found_, size_t off)
        : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off), value(value_) {}

    /// Dereferences the stored pointer without checking it for null.
    Mapped & getMapped() const { return *value; }
};

/// Find result for hash tables without a mapped value: only the "found" flag
/// and the optional offset.
template <bool need_offset>
class FindResultImpl<void, need_offset> : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
{
public:
    FindResultImpl(bool found_, size_t off) : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off) {}
};
template <typename Derived, typename Value, typename Mapped, bool consecutive_keys_optimization, bool need_offset = false>
2019-02-01 08:23:38 +00:00
class HashMethodBase
2019-01-21 10:39:53 +00:00
{
2019-02-01 08:23:38 +00:00
public:
2019-01-21 10:39:53 +00:00
using EmplaceResult = EmplaceResultImpl<Mapped>;
using FindResult = FindResultImpl<Mapped, need_offset>;
2022-03-11 13:34:58 +00:00
static constexpr bool has_mapped = !std::is_same_v<Mapped, void>;
2019-01-21 10:39:53 +00:00
using Cache = LastElementCache<Value, consecutive_keys_optimization>;
2019-02-01 08:23:38 +00:00
static HashMethodContextPtr createContext(const HashMethodContext::Settings &) { return nullptr; }
template <typename Data>
ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool)
{
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return emplaceImpl(key_holder, data);
2019-02-01 08:23:38 +00:00
}
template <typename Data>
ALWAYS_INLINE FindResult findKey(Data & data, size_t row, Arena & pool)
{
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return findKeyImpl(keyHolderGetKey(key_holder), data);
2019-02-01 08:23:38 +00:00
}
template <typename Data>
ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool)
{
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return data.hash(keyHolderGetKey(key_holder));
2019-02-01 08:23:38 +00:00
}
2019-01-21 10:39:53 +00:00
protected:
Cache cache;
HashMethodBase()
{
2019-02-04 14:36:15 +00:00
if constexpr (consecutive_keys_optimization)
2019-01-21 10:39:53 +00:00
{
2019-02-04 14:36:15 +00:00
if constexpr (has_mapped)
{
/// Init PairNoInit elements.
2019-08-01 15:57:02 +00:00
cache.value.second = Mapped();
cache.value.first = {};
2019-02-04 14:36:15 +00:00
}
else
cache.value = Value();
2019-01-21 10:39:53 +00:00
}
}
template <typename Data, typename KeyHolder>
ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data)
2019-01-21 10:39:53 +00:00
{
if constexpr (Cache::consecutive_keys_optimization)
{
if (cache.found && cache.check(keyHolderGetKey(key_holder)))
2019-01-21 10:39:53 +00:00
{
if constexpr (has_mapped)
2019-08-01 15:57:02 +00:00
return EmplaceResult(cache.value.second, cache.value.second, false);
2019-01-21 10:39:53 +00:00
else
return EmplaceResult(false);
}
}
typename Data::LookupResult it;
2019-01-21 10:39:53 +00:00
bool inserted = false;
data.emplace(key_holder, it, inserted);
2019-02-01 08:23:38 +00:00
2019-02-04 14:36:15 +00:00
[[maybe_unused]] Mapped * cached = nullptr;
if constexpr (has_mapped)
2019-10-29 15:16:51 +00:00
cached = &it->getMapped();
2019-02-01 08:23:38 +00:00
if (inserted)
{
if constexpr (has_mapped)
2019-02-05 09:43:14 +00:00
{
2019-10-29 15:16:51 +00:00
new (&it->getMapped()) Mapped();
2019-02-05 09:43:14 +00:00
}
2019-02-01 08:23:38 +00:00
}
2019-01-21 10:39:53 +00:00
if constexpr (consecutive_keys_optimization)
{
cache.found = true;
cache.empty = false;
2019-02-04 14:36:15 +00:00
if constexpr (has_mapped)
{
2019-10-29 15:16:51 +00:00
cache.value.first = it->getKey();
cache.value.second = it->getMapped();
2019-08-01 15:57:02 +00:00
cached = &cache.value.second;
}
else
{
2019-10-29 15:16:51 +00:00
cache.value = it->getKey();
}
2019-01-21 10:39:53 +00:00
}
if constexpr (has_mapped)
2019-10-29 15:16:51 +00:00
return EmplaceResult(it->getMapped(), *cached, inserted);
2019-01-21 10:39:53 +00:00
else
return EmplaceResult(inserted);
}
template <typename Data, typename Key>
ALWAYS_INLINE FindResult findKeyImpl(Key key, Data & data)
{
if constexpr (Cache::consecutive_keys_optimization)
{
/// It's possible to support such combination, but code will became more complex.
/// Now there's not place where we need this options enabled together
static_assert(!FindResult::has_offset, "`consecutive_keys_optimization` and `has_offset` are conflicting options");
2019-01-21 10:39:53 +00:00
if (cache.check(key))
{
if constexpr (has_mapped)
return FindResult(&cache.value.second, cache.found, 0);
2019-01-21 10:39:53 +00:00
else
return FindResult(cache.found, 0);
2019-01-21 10:39:53 +00:00
}
}
auto it = data.find(key);
if constexpr (consecutive_keys_optimization)
{
cache.found = it != nullptr;
2019-01-21 10:39:53 +00:00
cache.empty = false;
if constexpr (has_mapped)
{
cache.value.first = key;
if (it)
{
2019-10-29 15:16:51 +00:00
cache.value.second = it->getMapped();
}
}
2019-01-21 10:39:53 +00:00
else
{
cache.value = key;
2019-01-21 10:39:53 +00:00
}
}
size_t offset = 0;
if constexpr (FindResult::has_offset)
{
offset = it ? data.offsetInternal(it) : 0;
}
2019-01-21 10:39:53 +00:00
if constexpr (has_mapped)
return FindResult(it ? &it->getMapped() : nullptr, it != nullptr, offset);
2019-01-21 10:39:53 +00:00
else
return FindResult(it != nullptr, offset);
2019-01-21 10:39:53 +00:00
}
};
/// A PaddedPODArray of mapped values of type T.
template <typename T>
struct MappedCache : PaddedPODArray<T>
{
};

/// With no mapped type there is nothing to store, so the specialization is empty.
template <>
struct MappedCache<void>
{
};
/// This class is designed to provide the functionality that is required for
/// supporting nullable keys in HashMethodKeysFixed.
/// The `has_nullable_keys = true` specialization separates nullable key columns
/// into nested columns and null maps and can build a per-row null bitmap.
/// The `has_nullable_keys = false` specialization only stores the key columns;
/// its createBitmap() throws a LOGICAL_ERROR exception.
template <typename Key, bool has_nullable_keys>
class BaseStateKeysFixed;
/// Case where nullable keys are supported.
template <typename Key>
class BaseStateKeysFixed<Key, true>
{
protected:
explicit BaseStateKeysFixed(const ColumnRawPtrs & key_columns)
2019-01-21 10:39:53 +00:00
{
null_maps.reserve(key_columns.size());
actual_columns.reserve(key_columns.size());
for (const auto & col : key_columns)
{
if (const auto * nullable_col = checkAndGetColumn<ColumnNullable>(col))
2019-01-21 10:39:53 +00:00
{
2019-06-26 17:20:33 +00:00
actual_columns.push_back(&nullable_col->getNestedColumn());
null_maps.push_back(&nullable_col->getNullMapColumn());
2019-01-21 10:39:53 +00:00
}
else
{
actual_columns.push_back(col);
null_maps.push_back(nullptr);
}
}
}
/// Return the columns which actually contain the values of the keys.
/// For a given key column, if it is nullable, we return its nested
/// column. Otherwise we return the key column itself.
inline const ColumnRawPtrs & getActualColumns() const
{
return actual_columns;
}
/// Create a bitmap that indicates whether, for a particular row,
/// a key column bears a null value or not.
KeysNullMap<Key> createBitmap(size_t row) const
{
KeysNullMap<Key> bitmap{};
for (size_t k = 0; k < null_maps.size(); ++k)
{
if (null_maps[k] != nullptr)
{
const auto & null_map = assert_cast<const ColumnUInt8 &>(*null_maps[k]).getData();
2019-01-21 10:39:53 +00:00
if (null_map[row] == 1)
{
size_t bucket = k / 8;
size_t offset = k % 8;
bitmap[bucket] |= UInt8(1) << offset;
}
}
}
return bitmap;
}
private:
ColumnRawPtrs actual_columns;
ColumnRawPtrs null_maps;
};
/// Case where nullable keys are not supported.
template <typename Key>
class BaseStateKeysFixed<Key, false>
{
protected:
    /// Stores a copy of the list of key column pointers.
    explicit BaseStateKeysFixed(const ColumnRawPtrs & columns) : actual_columns(columns) {}

    /// Returns the key columns exactly as they were passed to the constructor.
    const ColumnRawPtrs & getActualColumns() const { return actual_columns; }

    /// There are no null maps in this specialization, so calling this is a
    /// logical error: it always throws an Exception with ErrorCodes::LOGICAL_ERROR.
    KeysNullMap<Key> createBitmap(size_t) const
    {
        throw Exception{"Internal error: calling createBitmap() for non-nullable keys"
                        " is forbidden", ErrorCodes::LOGICAL_ERROR};
    }

private:
    ColumnRawPtrs actual_columns;
};
}
}
}