mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Added erase into HashTable
This commit is contained in:
parent
bfceb06659
commit
bf5d75853c
@ -194,9 +194,6 @@ struct HashTableCell
|
||||
/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
|
||||
static constexpr bool need_zero_value_storage = true;
|
||||
|
||||
/// Whether the cell is deleted.
|
||||
bool isDeleted() const { return false; }
|
||||
|
||||
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
|
||||
void setMapped(const value_type & /*value*/) {}
|
||||
|
||||
@ -230,6 +227,8 @@ struct HashTableGrower
|
||||
UInt8 size_degree = initial_size_degree;
|
||||
static constexpr auto initial_count = 1ULL << initial_size_degree;
|
||||
|
||||
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||
|
||||
/// The size of the hash table in the cells.
|
||||
size_t bufSize() const { return 1ULL << size_degree; }
|
||||
|
||||
@ -277,6 +276,9 @@ template <size_t key_bits>
|
||||
struct HashTableFixedGrower
|
||||
{
|
||||
static constexpr auto initial_count = 1ULL << key_bits;
|
||||
|
||||
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||
|
||||
size_t bufSize() const { return 1ULL << key_bits; }
|
||||
size_t place(size_t x) const { return x; }
|
||||
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
|
||||
@ -466,7 +468,7 @@ protected:
|
||||
*/
|
||||
size_t i = 0;
|
||||
for (; i < old_size; ++i)
|
||||
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
|
||||
if (!buf[i].isZero(*this))
|
||||
reinsert(buf[i], buf[i].getHash(*this));
|
||||
|
||||
/** There is also a special case:
|
||||
@ -477,7 +479,7 @@ protected:
|
||||
* after transferring all the elements from the old halves you need to [ o x ]
|
||||
* process tail from the collision resolution chain immediately after it [ o x ]
|
||||
*/
|
||||
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
|
||||
for (; !buf[i].isZero(*this); ++i)
|
||||
reinsert(buf[i], buf[i].getHash(*this));
|
||||
|
||||
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
|
||||
@ -829,6 +831,7 @@ protected:
|
||||
*/
|
||||
--m_size;
|
||||
buf[place_value].setZero();
|
||||
inserted = false;
|
||||
throw;
|
||||
}
|
||||
|
||||
@ -954,6 +957,72 @@ public:
|
||||
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
|
||||
}
|
||||
|
||||
template<typename = std::enable_if<Grower::performs_linear_probing_with_single_step, void>>
|
||||
void ALWAYS_INLINE erase(const Key & x)
|
||||
{
|
||||
/*
|
||||
Deletion of open address hash table without tombstones
|
||||
|
||||
https://en.wikipedia.org/wiki/Linear_probing
|
||||
https://en.wikipedia.org/wiki/Open_addressing
|
||||
Algorithm without recomputing hash but keep probes difference value (difference of natural cell position and inserted one)
|
||||
in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
|
||||
|
||||
Currently we use algorithm with hash recomputing on each step from https://en.wikipedia.org/wiki/Open_addressing.
|
||||
*/
|
||||
|
||||
if (Cell::isZero(x, *this))
|
||||
{
|
||||
if (this->hasZero())
|
||||
{
|
||||
--m_size;
|
||||
this->clearHasZero();
|
||||
}
|
||||
else
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
size_t hash_value = hash(x);
|
||||
size_t i = findCell(x, hash_value, grower.place(hash_value));
|
||||
|
||||
if (buf[i].isZero(*this))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/// We need to guarantee loop termination because there will be empty position
|
||||
assert(m_size < grower.bufSize());
|
||||
|
||||
size_t j = i;
|
||||
|
||||
while (true)
|
||||
{
|
||||
/// TODO: Modify to remove unnecessary setZero over loop
|
||||
buf[j].setZero();
|
||||
r2:
|
||||
j = grower.next(j);
|
||||
|
||||
if (buf[j].isZero(*this))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
/// If hash recomputing is expensive we can avoid it adding additional value in cell during insertion
|
||||
/// check algorithm link above
|
||||
size_t k = grower.place(buf[j].getHash(*this));
|
||||
|
||||
if (i <= j ? ((i < k) && (k <= j)) : ((i < k) || (k <= j)))
|
||||
goto r2;
|
||||
|
||||
memcpy(static_cast<void *>(&buf[i]), static_cast<void *>(&buf[j]), sizeof(Cell));
|
||||
i = j;
|
||||
}
|
||||
|
||||
--m_size;
|
||||
}
|
||||
|
||||
bool ALWAYS_INLINE has(const Key & x) const
|
||||
{
|
||||
if (Cell::isZero(x, *this))
|
||||
|
@ -353,13 +353,7 @@ private:
|
||||
void destroyLastElement()
|
||||
{
|
||||
auto last_element = counter_list.back();
|
||||
|
||||
if constexpr (std::is_same_v<StringRef, TKey>)
|
||||
{
|
||||
auto last_element_it = counter_map.find(last_element->key, last_element->hash);
|
||||
last_element_it->setZero();
|
||||
}
|
||||
|
||||
counter_map.erase(last_element->key);
|
||||
arena.free(last_element->key);
|
||||
delete last_element;
|
||||
counter_list.pop_back();
|
||||
|
@ -13,6 +13,9 @@ target_link_libraries (auto_array PRIVATE clickhouse_common_io)
|
||||
add_executable (hash_table hash_table.cpp)
|
||||
target_link_libraries (hash_table PRIVATE clickhouse_common_io)
|
||||
|
||||
add_executable (hash_table_erase hash_table_erase.cpp)
|
||||
target_link_libraries (hash_table_erase PRIVATE clickhouse_common_io)
|
||||
|
||||
add_executable (small_table small_table.cpp)
|
||||
target_link_libraries (small_table PRIVATE clickhouse_common_io)
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
#include <Interpreters/AggregationCommon.h>
|
||||
|
||||
@ -25,13 +25,26 @@ int main(int, char **)
|
||||
cont.emplace(key, it, inserted);
|
||||
std::cerr << inserted << ", " << key << std::endl;
|
||||
|
||||
std::cerr << "Before erase" << std::endl;
|
||||
for (auto x : cont)
|
||||
std::cerr << x.getValue() << std::endl;
|
||||
|
||||
DB::WriteBufferFromOwnString wb;
|
||||
cont.writeText(wb);
|
||||
|
||||
std::cerr << "dump: " << wb.str() << std::endl;
|
||||
std::cerr << "Dump before erase: " << wb.str() << std::endl;
|
||||
|
||||
cont.erase(2);
|
||||
cont.erase(3);
|
||||
|
||||
std::cerr << "After erase" << std::endl;
|
||||
for (auto x : cont)
|
||||
std::cerr << x.getValue() << std::endl;
|
||||
|
||||
wb.restart();
|
||||
cont.writeText(wb);
|
||||
|
||||
std::cerr << "Dump after erase: " << wb.str() << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
|
43
src/Common/tests/hash_table_erase.cpp
Normal file
43
src/Common/tests/hash_table_erase.cpp
Normal file
@ -0,0 +1,43 @@
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
|
||||
int main(int, char **)
|
||||
{
|
||||
{
|
||||
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||
Cont cont;
|
||||
|
||||
for (size_t i = 0; i < 5000; ++i)
|
||||
{
|
||||
cont.insert(i);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 2500; ++i)
|
||||
{
|
||||
cont.erase(i);
|
||||
}
|
||||
|
||||
for (size_t i = 5000; i < 10000; ++i)
|
||||
{
|
||||
cont.insert(i);
|
||||
}
|
||||
|
||||
for (size_t i = 5000; i < 10000; ++i)
|
||||
{
|
||||
cont.erase(i);
|
||||
}
|
||||
|
||||
for (size_t i = 2500; i < 5000; ++i)
|
||||
{
|
||||
cont.erase(i);
|
||||
}
|
||||
|
||||
std::cerr << "size: " << cont.size() << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user