mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 09:10:48 +00:00
Merge pull request #17845 from kitaisreal/space-saving-remove-last-element-from-map-fix
SpaceSaving remove last element from map fix
This commit is contained in:
commit
8df4789113
@ -194,9 +194,6 @@ struct HashTableCell
|
||||
/// Does the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table)?
|
||||
static constexpr bool need_zero_value_storage = true;
|
||||
|
||||
/// Whether the cell is deleted.
|
||||
bool isDeleted() const { return false; }
|
||||
|
||||
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
|
||||
void setMapped(const value_type & /*value*/) {}
|
||||
|
||||
@ -230,6 +227,9 @@ struct HashTableGrower
|
||||
UInt8 size_degree = initial_size_degree;
|
||||
static constexpr auto initial_count = 1ULL << initial_size_degree;
|
||||
|
||||
/// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
|
||||
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||
|
||||
/// The size of the hash table in the cells.
|
||||
size_t bufSize() const { return 1ULL << size_degree; }
|
||||
|
||||
@ -277,6 +277,9 @@ template <size_t key_bits>
|
||||
struct HashTableFixedGrower
|
||||
{
|
||||
static constexpr auto initial_count = 1ULL << key_bits;
|
||||
|
||||
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||
|
||||
size_t bufSize() const { return 1ULL << key_bits; }
|
||||
size_t place(size_t x) const { return x; }
|
||||
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and the result turns out to be less efficient.
|
||||
@ -466,7 +469,7 @@ protected:
|
||||
*/
|
||||
size_t i = 0;
|
||||
for (; i < old_size; ++i)
|
||||
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
|
||||
if (!buf[i].isZero(*this))
|
||||
reinsert(buf[i], buf[i].getHash(*this));
|
||||
|
||||
/** There is also a special case:
|
||||
@ -477,7 +480,7 @@ protected:
|
||||
* after transferring all the elements from the old halves you need to [ o x ]
|
||||
* process tail from the collision resolution chain immediately after it [ o x ]
|
||||
*/
|
||||
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
|
||||
for (; !buf[i].isZero(*this); ++i)
|
||||
reinsert(buf[i], buf[i].getHash(*this));
|
||||
|
||||
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
|
||||
@ -829,6 +832,7 @@ protected:
|
||||
*/
|
||||
--m_size;
|
||||
buf[place_value].setZero();
|
||||
inserted = false;
|
||||
throw;
|
||||
}
|
||||
|
||||
@ -954,6 +958,97 @@ public:
|
||||
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
|
||||
}
|
||||
|
||||
std::enable_if_t<Grower::performs_linear_probing_with_single_step, void>
|
||||
ALWAYS_INLINE erase(const Key & x)
|
||||
{
|
||||
/** Deletion from open addressing hash table without tombstones
|
||||
*
|
||||
* https://en.wikipedia.org/wiki/Linear_probing
|
||||
* https://en.wikipedia.org/wiki/Open_addressing
|
||||
* Algorithm without recomputing hash but keep probes difference value (difference of natural cell position and inserted one)
|
||||
* in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
|
||||
*
|
||||
* Currently we use algorithm with hash recomputing on each step from https://en.wikipedia.org/wiki/Open_addressing
|
||||
*/
|
||||
|
||||
if (Cell::isZero(x, *this))
|
||||
{
|
||||
if (this->hasZero())
|
||||
{
|
||||
--m_size;
|
||||
this->clearHasZero();
|
||||
}
|
||||
else
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
size_t hash_value = hash(x);
|
||||
size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value));
|
||||
|
||||
/// Key is not found
|
||||
if (buf[erased_key_position].isZero(*this))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/// We need to guarantee loop termination because there will be empty position
|
||||
assert(m_size < grower.bufSize());
|
||||
|
||||
size_t next_position = erased_key_position;
|
||||
|
||||
/// Walk to the right through collision resolution chain and move elements to better positions
|
||||
while (true)
|
||||
{
|
||||
next_position = grower.next(next_position);
|
||||
|
||||
/// If there's no more elements in the chain
|
||||
if (buf[next_position].isZero(*this))
|
||||
break;
|
||||
|
||||
/// The optimal position of the element in the cell at next_position
|
||||
size_t optimal_position = grower.place(buf[next_position].getHash(*this));
|
||||
|
||||
/// If position of this element is already optimal - proceed to the next element.
|
||||
if (optimal_position == next_position)
|
||||
continue;
|
||||
|
||||
/// The case of non overlapping part of chain
|
||||
if (next_position > erased_key_position
|
||||
/// Cannot move this element because optimal position is after the freed place
|
||||
/// The second condition is tricky - if the chain was overlapped before erased_key_position,
|
||||
/// and the optimal position is actually before in collision resolution chain:
|
||||
///
|
||||
/// [*xn***----------------***]
|
||||
/// ^^-next elem ^
|
||||
/// | |
|
||||
/// erased elem the optimal position of the next elem
|
||||
///
|
||||
/// so, the next elem should be moved to position of erased elem
|
||||
&& (optimal_position > erased_key_position) && (optimal_position < next_position))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
/// The case of overlapping chain
|
||||
if (next_position < erased_key_position
|
||||
/// Cannot move this element because optimal position is after the freed place
|
||||
&& ((optimal_position > erased_key_position) || (optimal_position < next_position)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
/// Move the element to the freed place
|
||||
memcpy(static_cast<void *>(&buf[erased_key_position]), static_cast<void *>(&buf[next_position]), sizeof(Cell));
|
||||
/// Now we have another freed place
|
||||
erased_key_position = next_position;
|
||||
}
|
||||
|
||||
buf[erased_key_position].setZero();
|
||||
--m_size;
|
||||
}
|
||||
|
||||
bool ALWAYS_INLINE has(const Key & x) const
|
||||
{
|
||||
if (Cell::isZero(x, *this))
|
||||
|
@ -353,6 +353,7 @@ private:
|
||||
void destroyLastElement()
|
||||
{
|
||||
auto last_element = counter_list.back();
|
||||
counter_map.erase(last_element->key);
|
||||
arena.free(last_element->key);
|
||||
delete last_element;
|
||||
counter_list.pop_back();
|
||||
|
@ -10,9 +10,6 @@ target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io)
|
||||
add_executable (auto_array auto_array.cpp)
|
||||
target_link_libraries (auto_array PRIVATE clickhouse_common_io)
|
||||
|
||||
add_executable (hash_table hash_table.cpp)
|
||||
target_link_libraries (hash_table PRIVATE clickhouse_common_io)
|
||||
|
||||
add_executable (small_table small_table.cpp)
|
||||
target_link_libraries (small_table PRIVATE clickhouse_common_io)
|
||||
|
||||
|
210
src/Common/tests/gtest_hash_table.cpp
Normal file
210
src/Common/tests/gtest_hash_table.cpp
Normal file
@ -0,0 +1,210 @@
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
#include <Interpreters/AggregationCommon.h>
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
/// Identity "hash": the hash of a key is the key itself converted to size_t.
/// Used to test dump functionality without depending on other hashes that can change.
template <typename T>
struct DummyHash
{
    size_t operator()(T key) const
    {
        const T same_key(key);
        return same_key;
    }
};
|
||||
|
||||
/// Collects the values of all cells of `table` into an ordered std::set,
/// so that the contents of two tables can be compared regardless of iteration order.
template <typename HashTable>
std::set<typename HashTable::value_type> convertToSet(const HashTable & table)
{
    std::set<typename HashTable::value_type> result;

    /// Iterate by const reference: there is no need to copy each cell just to read its value.
    for (const auto & cell : table)
        result.emplace(cell.getValue());

    return result;
}
|
||||
|
||||
|
||||
/// Two distinct keys inserted into a table with the smallest initial size must both be counted.
TEST(HashTable, Insert)
{
    using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
    IntSet int_set;

    int_set.insert(1);
    int_set.insert(2);

    ASSERT_EQ(int_set.size(), 2);
}
|
||||
|
||||
/// emplace() must report `inserted == true` for a new key and `false` for an existing one,
/// and in both cases point the lookup result at the cell holding the key.
TEST(HashTable, Emplace)
{
    using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
    IntSet int_set;

    IntSet::LookupResult cell;
    bool inserted = false;

    /// A new key.
    int_set.emplace(1, cell, inserted);
    ASSERT_EQ(cell->getKey(), 1);
    ASSERT_TRUE(inserted);

    /// Another new key.
    int_set.emplace(2, cell, inserted);
    ASSERT_EQ(cell->getKey(), 2);
    ASSERT_TRUE(inserted);

    /// A key that is already present.
    int_set.emplace(1, cell, inserted);
    ASSERT_EQ(cell->getKey(), 1);
    ASSERT_FALSE(inserted);
}
|
||||
|
||||
/// find() must return a non-null result for inserted keys and nullptr for a key that was never inserted.
TEST(HashTable, Lookup)
{
    using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
    IntSet int_set;

    int_set.insert(1);
    int_set.insert(2);

    IntSet::LookupResult first = int_set.find(1);
    ASSERT_TRUE(first != nullptr);

    IntSet::LookupResult second = int_set.find(2);
    ASSERT_TRUE(second != nullptr);

    IntSet::LookupResult missing = int_set.find(3);
    ASSERT_TRUE(missing == nullptr);
}
|
||||
|
||||
/// Iterating over the table must visit exactly the inserted keys.
TEST(HashTable, Iteration)
{
    using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
    IntSet int_set;

    int_set.insert(1);
    int_set.insert(2);
    int_set.insert(3);

    std::set<int> visited = convertToSet(int_set);
    std::set<int> inserted_keys = {1, 2, 3};

    ASSERT_EQ(visited, inserted_keys);
}
|
||||
|
||||
/// Interleaves bulk insertions and erasures; after every inserted key has been erased the table must be empty.
TEST(HashTable, Erase)
{
    using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
    IntSet int_set;

    /// Inserts every key of the half-open range [from, to).
    auto insert_range = [&int_set](size_t from, size_t to)
    {
        for (size_t key = from; key < to; ++key)
            int_set.insert(key);
    };

    /// Erases every key of the half-open range [from, to).
    auto erase_range = [&int_set](size_t from, size_t to)
    {
        for (size_t key = from; key < to; ++key)
            int_set.erase(key);
    };

    insert_range(0, 5000);
    erase_range(0, 2500);

    insert_range(5000, 10000);
    erase_range(5000, 10000);

    erase_range(2500, 5000);

    ASSERT_EQ(int_set.size(), 0);
}
|
||||
|
||||
/// Round-trips tables through the text and binary formats and checks that the contents survive.
/// For the cases with a fixed expected string the serialized form itself is also checked.
TEST(HashTable, SerializationDeserialization)
{
    /// Text format, non-empty table.
    {
        /// Use dummy hash to make it reproducible if default hash implementation will be changed
        using IntSet = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
        IntSet source;

        source.insert(1);
        source.insert(2);
        source.insert(3);

        DB::WriteBufferFromOwnString out;
        source.writeText(out);

        std::string expected = "3,1,2,3";
        ASSERT_EQ(out.str(), expected);

        DB::ReadBufferFromString in(expected);
        IntSet restored;
        restored.readText(in);

        ASSERT_EQ(convertToSet(source), convertToSet(restored));
    }

    /// Binary format, non-empty table: only the round trip is checked.
    {
        using IntSet = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
        IntSet source;

        source.insert(1);
        source.insert(2);
        source.insert(3);

        DB::WriteBufferFromOwnString out;
        source.write(out);

        DB::ReadBufferFromString in(out.str());
        IntSet restored;
        restored.read(in);

        ASSERT_EQ(convertToSet(source), convertToSet(restored));
    }

    /// Text format, empty table.
    {
        using IntSet = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
        IntSet source;

        DB::WriteBufferFromOwnString out;
        source.writeText(out);

        std::string expected = "0";
        ASSERT_EQ(out.str(), expected);

        DB::ReadBufferFromString in(expected);
        IntSet restored;
        restored.readText(in);

        ASSERT_EQ(convertToSet(source), convertToSet(restored));
    }

    /// Binary format, empty table: the dump is expected to be a single zero byte.
    {
        using WideSet = HashSet<DB::UInt128, DB::UInt128TrivialHash>;
        WideSet source;

        DB::WriteBufferFromOwnString out;
        source.write(out);

        std::string expected;
        expected += static_cast<char>(0);
        ASSERT_EQ(out.str(), expected);

        DB::ReadBufferFromString in(expected);
        WideSet restored;
        restored.read(in);

        ASSERT_EQ(convertToSet(source), convertToSet(restored));
    }
}
|
@ -1,50 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#include <Interpreters/AggregationCommon.h>
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
|
||||
int main(int, char **)
|
||||
{
|
||||
{
|
||||
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||
Cont cont;
|
||||
|
||||
cont.insert(1);
|
||||
cont.insert(2);
|
||||
|
||||
Cont::LookupResult it;
|
||||
bool inserted;
|
||||
int key = 3;
|
||||
cont.emplace(key, it, inserted);
|
||||
std::cerr << inserted << ", " << key << std::endl;
|
||||
|
||||
cont.emplace(key, it, inserted);
|
||||
std::cerr << inserted << ", " << key << std::endl;
|
||||
|
||||
for (auto x : cont)
|
||||
std::cerr << x.getValue() << std::endl;
|
||||
|
||||
DB::WriteBufferFromOwnString wb;
|
||||
cont.writeText(wb);
|
||||
|
||||
std::cerr << "dump: " << wb.str() << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
using Cont = HashSet<
|
||||
DB::UInt128,
|
||||
DB::UInt128TrivialHash>;
|
||||
Cont cont;
|
||||
|
||||
DB::WriteBufferFromOwnString wb;
|
||||
cont.write(wb);
|
||||
|
||||
std::cerr << "dump: " << wb.str() << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -1 +1,8 @@
|
||||
[0,1,2,3,4,5,6,7,8,9]
|
||||
0 [[],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5']]]
|
||||
1 [[[]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6']]]
|
||||
2 [[[],[NULL]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6'],[NULL,'1','2','3','4','5','6','7']]]
|
||||
3 [[[],[NULL],[NULL,'1']]]
|
||||
4 [[[],[NULL],[NULL,'1'],[NULL,'1','2']]]
|
||||
5 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3']]]
|
||||
6 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4']]]
|
||||
|
@ -1 +1,15 @@
|
||||
SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000);
|
||||
|
||||
SELECT
|
||||
k,
|
||||
topK(v)
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
number % 7 AS k,
|
||||
arrayMap(x -> arrayMap(x -> if(x = 0, NULL, toString(x)), range(x)), range(intDiv(number, 1))) AS v
|
||||
FROM system.numbers
|
||||
LIMIT 10
|
||||
)
|
||||
GROUP BY k
|
||||
ORDER BY k ASC
|
||||
|
Loading…
Reference in New Issue
Block a user