mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-14 18:32:29 +00:00
342ec02664
The problem is that GROUP BY can update saved_hash, which can also be
updated by a subsequent update of the dictionary, and this leads to a
use-after-free.
The ASan report is in `details` below.
<details>
==24679==ERROR: AddressSanitizer: heap-use-after-free on address 0x604000615d20 at pc 0x000022cc8684 bp 0x7ffea6b5f850 sp 0x7ffea6b5f848
READ of size 8 at 0x604000615d20 thread T223 (QueryPipelineEx)
0 0x22cc8683 in DB::ReverseIndex<>::insert(StringRef const&) obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h
1 0x22cc0de1 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeImpl<char8_t>()::'lambda'()::operator()() const obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:540:39
2 0x22cc0de1 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeImpl<char8_t>() obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:570:23
3 0x22cb9c66 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeFrom()::'lambda'(auto)::operator()<char8_t>(auto) const obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:592:26
4 0x22cb9c66 in DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeFrom() obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:600:28
5 0x2500b897 in DB::ColumnLowCardinality::insertRangeFrom() obj-x86_64-linux-gnu/../src/Columns/ColumnLowCardinality.cpp:205:62
6 0x25a182f4 in DB::appendBlock(DB::Block const&, DB::Block&) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:470:23
7 0x25a182f4 in DB::BufferSink::insertIntoBuffer(DB::Block const&, DB::StorageBuffer::Buffer&) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:634:9
8 0x25a173cc in DB::BufferSink::consume(DB::Chunk) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:595:9
9 0x26d1c997 in DB::SinkToStorage::transform(DB::Chunk&) obj-x86_64-linux-gnu/../src/Processors/Sinks/SinkToStorage.cpp:18:5
0x604000615d20 is located 16 bytes inside of 40-byte region [0x604000615d10,0x604000615d38)
freed by thread T37 (QueryPipelineEx) here:
2 0x22cb9392 in boost::intrusive_ptr<DB::ColumnVector<unsigned long> >::~intrusive_ptr() obj-x86_64-linux-gnu/../contrib/boost/boost/smart_ptr/intrusive_ptr.hpp:98:23
4 0x22cb9392 in COW<DB::IColumn>::mutable_ptr<DB::ColumnVector<unsigned long> >::operator=() obj-x86_64-linux-gnu/../src/Common/COW.h:100:57
5 0x22cb9392 in DB::ReverseIndex<>::tryGetSavedHash() const obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h:362:28
6 0x22cb9392 in DB::ColumnUnique<DB::ColumnString>::tryGetSavedHash() const obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:125:76
7 0x242eaed3 in DB::ColumnsHashing::HashMethodSingleLowCardinalityColumn<>::HashMethodSingleLowCardinalityColumn() obj-x86_64-linux-gnu/../src/Common/ColumnsHashing.h:287:50
8 0x242206c6 in void DB::Aggregator::executeImpl<>() const obj-x86_64-linux-gnu/../src/Interpreters/Aggregator.cpp:596:28
9 0x24148e99 in DB::Aggregator::executeOnBlock() const obj-x86_64-linux-gnu/../src/Interpreters/Aggregator.cpp:1004:9
10 0x26c24f3a in DB::AggregatingTransform::consume(DB::Chunk) obj-x86_64-linux-gnu/../src/Processors/Transforms/AggregatingTransform.cpp:539:33
11 0x26c2054e in DB::AggregatingTransform::work() obj-x86_64-linux-gnu/../src/Processors/Transforms/AggregatingTransform.cpp:500:9
previously allocated by thread T37 (QueryPipelineEx) here:
0 0xb6d44fd in operator new(unsigned long) (/src/ch/tmp/upstream/clickhouse-asan+0xb6d44fd)
1 0x11b78580 in COW<DB::IColumn>::mutable_ptr<> COWHelper<>::create<unsigned long&>(unsigned long&) (/src/ch/tmp/upstream/clickhouse-asan+0x11b78580)
2 0x22cbf7b1 in DB::ReverseIndex<>::calcHashes() const obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h:472:17
3 0x22cc2307 in DB::ReverseIndex<>::buildIndex() obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h:438:22
4 0x22cc658c in DB::ReverseIndex<>::insert(StringRef const&) obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h:484:9
5 0x22cc0de1 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeImpl<char8_t>()::'lambda'()::operator()() const obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:540:39
6 0x22cc0de1 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeImpl<char8_t>() obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:570:23
7 0x22cb9c66 in COW<DB::IColumn>::mutable_ptr<DB::IColumn> DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeFrom()::'lambda'(auto)::operator()<char8_t>(auto) const obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:592:26
8 0x22cb9c66 in DB::ColumnUnique<DB::ColumnString>::uniqueInsertRangeFrom() obj-x86_64-linux-gnu/../src/Columns/ColumnUnique.h:600:28
9 0x2500b897 in DB::ColumnLowCardinality::insertRangeFrom() obj-x86_64-linux-gnu/../src/Columns/ColumnLowCardinality.cpp:205:62
10 0x25a182f4 in DB::appendBlock(DB::Block const&, DB::Block&) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:470:23
11 0x25a182f4 in DB::BufferSink::insertIntoBuffer(DB::Block const&, DB::StorageBuffer::Buffer&) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:634:9
12 0x25a173cc in DB::BufferSink::consume(DB::Chunk) obj-x86_64-linux-gnu/../src/Storages/StorageBuffer.cpp:595:9
13 0x26d1c997 in DB::SinkToStorage::transform(DB::Chunk&) obj-x86_64-linux-gnu/../src/Processors/Sinks/SinkToStorage.cpp:18:5
SUMMARY: AddressSanitizer: heap-use-after-free obj-x86_64-linux-gnu/../src/Columns/ReverseIndex.h in DB::ReverseIndex<unsigned long, DB::ColumnString>::insert(StringRef const&)
Shadow bytes around the buggy address:
0x0c08800bab50: fa fa fd fd fd fd fd fa fa fa 00 00 00 00 00 00
0x0c08800bab60: fa fa fd fd fd fd fd fa fa fa 00 00 00 00 00 fa
0x0c08800bab70: fa fa 00 00 00 00 00 00 fa fa 00 00 00 00 00 00
0x0c08800bab80: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fa
0x0c08800bab90: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fd
=>0x0c08800baba0: fa fa fd fd[fd]fd fd fa fa fa fd fd fd fd fd fa
0x0c08800babb0: fa fa fd fd fd fd fd fd fa fa fd fd fd fd fd fa
0x0c08800babc0: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fd
0x0c08800babd0: fa fa fd fd fd fd fd fd fa fa fd fd fd fd fd fa
0x0c08800babe0: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fd
0x0c08800babf0: fa fa fd fd fd fd fd fd fa fa fd fd fd fd fd fd
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
==24679==ABORTING
</details>
533 lines
18 KiB
C++
533 lines
18 KiB
C++
#pragma once
|
|
|
|
#include <Common/HashTable/Hash.h>
|
|
#include <Common/HashTable/HashTable.h>
|
|
#include <Common/HashTable/HashTableAllocator.h>
|
|
#include <Columns/ColumnString.h>
|
|
#include <Columns/ColumnsNumber.h>
|
|
#include <Common/assert_cast.h>
|
|
#include <base/range.h>
|
|
#include <base/unaligned.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
namespace ErrorCodes
|
|
{
|
|
extern const int LOGICAL_ERROR;
|
|
}
|
|
|
|
namespace
|
|
{
|
|
/// Per-table state that the reverse index cells consult to hash and compare keys.
/// Two compile-time flags select the layout:
///  - with_saved_hash: the state also points at a container of precomputed hashes;
///  - has_base_index: stored keys are shifted by `base_index` relative to row numbers.
template <typename ColumnType, bool with_saved_hash, bool has_base_index>
struct ReverseIndexHashTableState;

/// State without saved hashes and without a base index.
template <typename ColumnType>
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ false>
{
    constexpr static bool with_saved_hash = false;
    constexpr static bool has_base_index = false;

    /// Column whose rows the stored keys refer to (raw pointer, assigned by the index owner).
    /// Defaults to nullptr so a freshly constructed table never holds an indeterminate pointer.
    ColumnType * index_column = nullptr;
};
|
|
|
|
template <typename ColumnType>
|
|
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ true>
|
|
{
|
|
constexpr static bool with_saved_hash = false;
|
|
constexpr static bool has_base_index = true;
|
|
|
|
ColumnType * index_column;
|
|
size_t base_index;
|
|
};
|
|
|
|
template <typename ColumnType>
|
|
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ false>
|
|
{
|
|
constexpr static bool with_saved_hash = true;
|
|
constexpr static bool has_base_index = false;
|
|
|
|
ColumnType * index_column;
|
|
typename ColumnVector<UInt64>::Container * saved_hash_column;
|
|
};
|
|
|
|
template <typename ColumnType>
|
|
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ true>
|
|
{
|
|
constexpr static bool with_saved_hash = true;
|
|
constexpr static bool has_base_index = true;
|
|
|
|
ColumnType * index_column;
|
|
typename ColumnVector<UInt64>::Container * saved_hash_column;
|
|
size_t base_index;
|
|
};
|
|
|
|
|
|
struct ReverseIndexHash
|
|
{
|
|
template <typename T>
|
|
size_t operator()(T) const
|
|
{
|
|
throw Exception("operator()(key) is not implemented for ReverseIndexHash.", ErrorCodes::LOGICAL_ERROR);
|
|
}
|
|
};
|
|
|
|
template <typename IndexType, typename Hash, typename HashTable, typename ColumnType, bool string_hash, bool has_base_index>
|
|
struct ReverseIndexHashTableCell
|
|
: public HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>
|
|
{
|
|
using Base = HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>;
|
|
using State = typename Base::State;
|
|
using Base::Base;
|
|
using Base::key;
|
|
using Base::keyEquals;
|
|
using Base::isZero;
|
|
|
|
template <typename T>
|
|
static bool isZero(const T &, const State & /*state*/)
|
|
{
|
|
/// Careful: apparently this uses SFINAE to redefine isZero for all types
|
|
/// except the IndexType, for which the default ZeroTraits::isZero is used.
|
|
static_assert(!std::is_same_v<typename std::decay<T>::type, typename std::decay<IndexType>::type>);
|
|
return false;
|
|
}
|
|
|
|
/// Special case when we want to compare with something not in index_column.
|
|
/// When we compare something inside column default keyEquals checks only that row numbers are equal.
|
|
bool keyEquals(const StringRef & object, size_t hash_ [[maybe_unused]], const State & state) const
|
|
{
|
|
auto index = key;
|
|
if constexpr (has_base_index)
|
|
index -= state.base_index;
|
|
|
|
if constexpr (string_hash)
|
|
return hash_ == (*state.saved_hash_column)[index] && object == state.index_column->getDataAt(index);
|
|
else
|
|
return object == state.index_column->getDataAt(index);
|
|
}
|
|
|
|
size_t getHash(const Hash & hash) const
|
|
{
|
|
auto index = key;
|
|
|
|
/// Hack. HashTable is Hash itself.
|
|
const auto & state = static_cast<const State &>(static_cast<const HashTable &>(hash));
|
|
|
|
if constexpr (has_base_index)
|
|
index -= state.base_index;
|
|
|
|
if constexpr (string_hash)
|
|
return (*state.saved_hash_column)[index];
|
|
else
|
|
{
|
|
using ValueType = typename ColumnType::ValueType;
|
|
ValueType value = unalignedLoad<ValueType>(state.index_column->getDataAt(index).data);
|
|
return DefaultHash<ValueType>()(value);
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
/**
|
|
* ReverseIndexHashTableBase implements a special hash table interface for
|
|
* reverse index.
|
|
*
|
|
* The following requirements are different compared to a plain hash table:
|
|
*
|
|
* 1) Provide public access to 'hash table state' that contains
|
|
* additional data needed to calculate cell hashes.
|
|
*
|
|
* 2) Support emplace() and find() with a Key different from the resulting
|
|
* hash table key. This means emplace() accepts a different kind of object
|
|
* as a key, and then the real key can be read from the returned cell iterator.
|
|
*
|
|
* These requirements are unique to ReverseIndex and are in conflict with
|
|
* supporting hash tables that use alternative key storage, such as FixedHashMap
|
|
* or StringHashMap. Therefore, we implement an interface for ReverseIndex
|
|
* separately.
|
|
*/
|
|
template <typename Key, typename Cell, typename Hash>
|
|
class ReverseIndexHashTableBase : public HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>
|
|
{
|
|
using State = typename Cell::State;
|
|
using Base = HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>;
|
|
|
|
public:
|
|
using Base::Base;
|
|
using iterator = typename Base::iterator;
|
|
using LookupResult = typename Base::LookupResult;
|
|
State & getState() { return *this; }
|
|
|
|
|
|
template <typename ObjectToCompareWith>
|
|
size_t ALWAYS_INLINE reverseIndexFindCell(const ObjectToCompareWith & x,
|
|
size_t hash_value, size_t place_value) const
|
|
{
|
|
while (!this->buf[place_value].isZero(*this)
|
|
&& !this->buf[place_value].keyEquals(x, hash_value, *this))
|
|
{
|
|
place_value = this->grower.next(place_value);
|
|
}
|
|
|
|
return place_value;
|
|
}
|
|
|
|
template <typename ObjectToCompareWith>
|
|
void ALWAYS_INLINE reverseIndexEmplaceNonZero(const Key & key, LookupResult & it,
|
|
bool & inserted, size_t hash_value, const ObjectToCompareWith & object)
|
|
{
|
|
size_t place_value = reverseIndexFindCell(object, hash_value,
|
|
this->grower.place(hash_value));
|
|
// emplaceNonZeroImpl() might need to re-find the cell if the table grows,
|
|
// but it will find it correctly by the key alone, so we don't have to
|
|
// pass it the 'object'.
|
|
this->emplaceNonZeroImpl(place_value, key, it, inserted, hash_value);
|
|
}
|
|
|
|
/// Searches position by object.
|
|
template <typename ObjectToCompareWith>
|
|
void ALWAYS_INLINE reverseIndexEmplace(Key key, iterator & it, bool & inserted,
|
|
size_t hash_value, const ObjectToCompareWith& object)
|
|
{
|
|
LookupResult impl_it = nullptr;
|
|
|
|
if (!this->emplaceIfZero(key, impl_it, inserted, hash_value))
|
|
{
|
|
reverseIndexEmplaceNonZero(key, impl_it, inserted, hash_value, object);
|
|
}
|
|
assert(impl_it != nullptr);
|
|
it = iterator(this, impl_it);
|
|
}
|
|
|
|
template <typename ObjectToCompareWith>
|
|
iterator ALWAYS_INLINE reverseIndexFind(ObjectToCompareWith x, size_t hash_value)
|
|
{
|
|
if (Cell::isZero(x, *this))
|
|
return this->hasZero() ? this->iteratorToZero() : this->end();
|
|
|
|
size_t place_value = reverseIndexFindCell(x, hash_value,
|
|
this->grower.place(hash_value));
|
|
return !this->buf[place_value].isZero(*this)
|
|
? iterator(this, &this->buf[place_value])
|
|
: this->end();
|
|
}
|
|
};
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index>
|
|
class ReverseIndexStringHashTable : public ReverseIndexHashTableBase<
|
|
IndexType,
|
|
ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
true,
|
|
has_base_index>,
|
|
ReverseIndexHash>
|
|
{
|
|
using Base = ReverseIndexHashTableBase<
|
|
IndexType,
|
|
ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
true,
|
|
has_base_index>,
|
|
ReverseIndexHash>;
|
|
public:
|
|
using Base::Base;
|
|
friend struct ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
true,
|
|
has_base_index>;
|
|
};
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index>
|
|
class ReverseIndexNumberHashTable : public ReverseIndexHashTableBase<
|
|
IndexType,
|
|
ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
false,
|
|
has_base_index>,
|
|
ReverseIndexHash>
|
|
{
|
|
using Base = ReverseIndexHashTableBase<
|
|
IndexType,
|
|
ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
false,
|
|
has_base_index>,
|
|
ReverseIndexHash>;
|
|
public:
|
|
using Base::Base;
|
|
friend struct ReverseIndexHashTableCell<
|
|
IndexType,
|
|
ReverseIndexHash,
|
|
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
|
|
ColumnType,
|
|
false,
|
|
has_base_index>;
|
|
};
|
|
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index, bool is_numeric_column>
|
|
struct SelectReverseIndexHashTable;
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index>
|
|
struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, true>
|
|
{
|
|
using Type = ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>;
|
|
};
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index>
|
|
struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, false>
|
|
{
|
|
using Type = ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>;
|
|
};
|
|
|
|
|
|
/// Fallback overload: a column type is not numeric unless a more specific overload says otherwise.
/// Only the static type of the (possibly null) pointer matters; the pointer is never dereferenced.
template <typename T>
constexpr bool isNumericColumn(const T *) { return false; }
|
|
|
|
template <typename T>
|
|
constexpr bool isNumericColumn(const ColumnVector<T> *) { return true; }
|
|
|
|
static_assert(isNumericColumn(static_cast<ColumnVector<UInt8> *>(nullptr)));
|
|
static_assert(!isNumericColumn(static_cast<ColumnString *>(nullptr)));
|
|
|
|
|
|
template <typename IndexType, typename ColumnType, bool has_base_index>
|
|
using ReverseIndexHashTable = typename SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index,
|
|
isNumericColumn(static_cast<ColumnType *>(nullptr))>::Type;
|
|
}
|
|
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
class ReverseIndex
|
|
{
|
|
public:
|
|
ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_)
|
|
: num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), external_saved_hash_ptr(nullptr) {}
|
|
|
|
void setColumn(ColumnType * column_);
|
|
|
|
static constexpr bool is_numeric_column = isNumericColumn(static_cast<ColumnType *>(nullptr));
|
|
static constexpr bool use_saved_hash = !is_numeric_column;
|
|
|
|
UInt64 insert(const StringRef & data);
|
|
|
|
/// Returns the found data's index in the dictionary. If index is not built, builds it.
|
|
UInt64 getInsertionPoint(StringRef data)
|
|
{
|
|
if (!index)
|
|
buildIndex();
|
|
return getIndexImpl(data);
|
|
}
|
|
|
|
/// Returns the found data's index in the dictionary if the #index is built, otherwise, returns a std::nullopt.
|
|
std::optional<UInt64> getIndex(StringRef data) const
|
|
{
|
|
if (!index)
|
|
return {};
|
|
return getIndexImpl(data);
|
|
}
|
|
|
|
UInt64 lastInsertionPoint() const { return size() + base_index; }
|
|
|
|
ColumnType * getColumn() const { return column; }
|
|
size_t size() const;
|
|
|
|
const UInt64 * tryGetSavedHash() const
|
|
{
|
|
if (!use_saved_hash)
|
|
return nullptr;
|
|
|
|
UInt64 * ptr = external_saved_hash_ptr.load();
|
|
if (!ptr)
|
|
{
|
|
auto hash = calcHashes();
|
|
ptr = &hash->getData()[0];
|
|
UInt64 * expected = nullptr;
|
|
if (external_saved_hash_ptr.compare_exchange_strong(expected, ptr))
|
|
external_saved_hash = std::move(hash);
|
|
else
|
|
ptr = expected;
|
|
}
|
|
|
|
return ptr;
|
|
}
|
|
|
|
size_t allocatedBytes() const { return index ? index->getBufferSizeInBytes() : 0; }
|
|
|
|
private:
|
|
ColumnType * column = nullptr;
|
|
UInt64 num_prefix_rows_to_skip; /// The number prefix tows in column which won't be sored at index.
|
|
UInt64 base_index; /// This values will be added to row number which is inserted into index.
|
|
|
|
using IndexMapType = ReverseIndexHashTable<IndexType, ColumnType, true>;
|
|
|
|
/// Lazy initialized.
|
|
std::unique_ptr<IndexMapType> index;
|
|
mutable ColumnUInt64::MutablePtr saved_hash;
|
|
/// For usage during GROUP BY
|
|
mutable ColumnUInt64::MutablePtr external_saved_hash;
|
|
mutable std::atomic<UInt64 *> external_saved_hash_ptr;
|
|
|
|
void buildIndex();
|
|
|
|
UInt64 getHash(const StringRef & ref) const
|
|
{
|
|
if constexpr (is_numeric_column)
|
|
{
|
|
using ValueType = typename ColumnType::ValueType;
|
|
ValueType value = unalignedLoad<ValueType>(ref.data);
|
|
return DefaultHash<ValueType>()(value);
|
|
}
|
|
else
|
|
return StringRefHash()(ref);
|
|
}
|
|
|
|
ColumnUInt64::MutablePtr calcHashes() const;
|
|
|
|
UInt64 getIndexImpl(StringRef data) const;
|
|
};
|
|
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
void ReverseIndex<IndexType, ColumnType>:: setColumn(ColumnType * column_)
|
|
{
|
|
if (column != column_)
|
|
{
|
|
index = nullptr;
|
|
saved_hash = nullptr;
|
|
}
|
|
|
|
column = column_;
|
|
}
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
size_t ReverseIndex<IndexType, ColumnType>::size() const
|
|
{
|
|
if (!column)
|
|
throw Exception("ReverseIndex has not size because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
return column->size();
|
|
}
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
void ReverseIndex<IndexType, ColumnType>::buildIndex()
|
|
{
|
|
if (index)
|
|
return;
|
|
|
|
if (!column)
|
|
throw Exception("ReverseIndex can't build index because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
auto size = column->size();
|
|
index = std::make_unique<IndexMapType>(size);
|
|
|
|
if constexpr (use_saved_hash)
|
|
saved_hash = calcHashes();
|
|
|
|
auto & state = index->getState();
|
|
state.index_column = column;
|
|
state.base_index = base_index;
|
|
if constexpr (use_saved_hash)
|
|
state.saved_hash_column = &saved_hash->getData();
|
|
|
|
using IteratorType = typename IndexMapType::iterator;
|
|
IteratorType iterator;
|
|
bool inserted;
|
|
|
|
for (auto row : collections::range(num_prefix_rows_to_skip, size))
|
|
{
|
|
UInt64 hash;
|
|
if constexpr (use_saved_hash)
|
|
hash = saved_hash->getElement(row);
|
|
else
|
|
hash = getHash(column->getDataAt(row));
|
|
|
|
index->reverseIndexEmplace(row + base_index, iterator, inserted, hash, column->getDataAt(row));
|
|
|
|
if (!inserted)
|
|
throw Exception("Duplicating keys found in ReverseIndex.", ErrorCodes::LOGICAL_ERROR);
|
|
}
|
|
}
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
ColumnUInt64::MutablePtr ReverseIndex<IndexType, ColumnType>::calcHashes() const
|
|
{
|
|
if (!column)
|
|
throw Exception("ReverseIndex can't build index because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
auto size = column->size();
|
|
auto hash = ColumnUInt64::create(size);
|
|
|
|
for (auto row : collections::range(0, size))
|
|
hash->getElement(row) = getHash(column->getDataAt(row));
|
|
|
|
return hash;
|
|
}
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
UInt64 ReverseIndex<IndexType, ColumnType>::insert(const StringRef & data)
|
|
{
|
|
if (!index)
|
|
buildIndex();
|
|
|
|
using IteratorType = typename IndexMapType::iterator;
|
|
IteratorType iterator;
|
|
bool inserted;
|
|
|
|
auto hash = getHash(data);
|
|
UInt64 num_rows = size();
|
|
|
|
if constexpr (use_saved_hash)
|
|
{
|
|
auto & column_data = saved_hash->getData();
|
|
if (column_data.size() <= num_rows)
|
|
column_data.resize(num_rows + 1);
|
|
column_data[num_rows] = hash;
|
|
}
|
|
else
|
|
column->insertData(data.data, data.size);
|
|
|
|
index->reverseIndexEmplace(num_rows + base_index, iterator, inserted, hash, data);
|
|
|
|
if constexpr (use_saved_hash)
|
|
{
|
|
if (inserted)
|
|
column->insertData(data.data, data.size);
|
|
}
|
|
else
|
|
{
|
|
if (!inserted)
|
|
column->popBack(1);
|
|
}
|
|
|
|
return iterator->getValue();
|
|
}
|
|
|
|
template <typename IndexType, typename ColumnType>
|
|
UInt64 ReverseIndex<IndexType, ColumnType>::getIndexImpl(StringRef data) const
|
|
{
|
|
using IteratorType = typename IndexMapType::iterator;
|
|
IteratorType iterator;
|
|
|
|
auto hash = getHash(data);
|
|
iterator = index->reverseIndexFind(data, hash);
|
|
|
|
return iterator == index->end() ? size() + base_index : iterator->getValue();
|
|
}
|
|
}
|