mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 09:10:48 +00:00
Merge pull request #17845 from kitaisreal/space-saving-remove-last-element-from-map-fix
SpaceSaving remove last element from map fix
This commit is contained in:
commit
8df4789113
@ -194,9 +194,6 @@ struct HashTableCell
|
|||||||
/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
|
/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
|
||||||
static constexpr bool need_zero_value_storage = true;
|
static constexpr bool need_zero_value_storage = true;
|
||||||
|
|
||||||
/// Whether the cell is deleted.
|
|
||||||
bool isDeleted() const { return false; }
|
|
||||||
|
|
||||||
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
|
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
|
||||||
void setMapped(const value_type & /*value*/) {}
|
void setMapped(const value_type & /*value*/) {}
|
||||||
|
|
||||||
@ -230,6 +227,9 @@ struct HashTableGrower
|
|||||||
UInt8 size_degree = initial_size_degree;
|
UInt8 size_degree = initial_size_degree;
|
||||||
static constexpr auto initial_count = 1ULL << initial_size_degree;
|
static constexpr auto initial_count = 1ULL << initial_size_degree;
|
||||||
|
|
||||||
|
/// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
|
||||||
|
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||||
|
|
||||||
/// The size of the hash table in the cells.
|
/// The size of the hash table in the cells.
|
||||||
size_t bufSize() const { return 1ULL << size_degree; }
|
size_t bufSize() const { return 1ULL << size_degree; }
|
||||||
|
|
||||||
@ -277,6 +277,9 @@ template <size_t key_bits>
|
|||||||
struct HashTableFixedGrower
|
struct HashTableFixedGrower
|
||||||
{
|
{
|
||||||
static constexpr auto initial_count = 1ULL << key_bits;
|
static constexpr auto initial_count = 1ULL << key_bits;
|
||||||
|
|
||||||
|
static constexpr auto performs_linear_probing_with_single_step = true;
|
||||||
|
|
||||||
size_t bufSize() const { return 1ULL << key_bits; }
|
size_t bufSize() const { return 1ULL << key_bits; }
|
||||||
size_t place(size_t x) const { return x; }
|
size_t place(size_t x) const { return x; }
|
||||||
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
|
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
|
||||||
@ -466,7 +469,7 @@ protected:
|
|||||||
*/
|
*/
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
for (; i < old_size; ++i)
|
for (; i < old_size; ++i)
|
||||||
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
|
if (!buf[i].isZero(*this))
|
||||||
reinsert(buf[i], buf[i].getHash(*this));
|
reinsert(buf[i], buf[i].getHash(*this));
|
||||||
|
|
||||||
/** There is also a special case:
|
/** There is also a special case:
|
||||||
@ -477,7 +480,7 @@ protected:
|
|||||||
* after transferring all the elements from the old halves you need to [ o x ]
|
* after transferring all the elements from the old halves you need to [ o x ]
|
||||||
* process tail from the collision resolution chain immediately after it [ o x ]
|
* process tail from the collision resolution chain immediately after it [ o x ]
|
||||||
*/
|
*/
|
||||||
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
|
for (; !buf[i].isZero(*this); ++i)
|
||||||
reinsert(buf[i], buf[i].getHash(*this));
|
reinsert(buf[i], buf[i].getHash(*this));
|
||||||
|
|
||||||
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
|
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
|
||||||
@ -829,6 +832,7 @@ protected:
|
|||||||
*/
|
*/
|
||||||
--m_size;
|
--m_size;
|
||||||
buf[place_value].setZero();
|
buf[place_value].setZero();
|
||||||
|
inserted = false;
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -954,6 +958,97 @@ public:
|
|||||||
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
|
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::enable_if_t<Grower::performs_linear_probing_with_single_step, void>
|
||||||
|
ALWAYS_INLINE erase(const Key & x)
|
||||||
|
{
|
||||||
|
/** Deletion from open addressing hash table without tombstones
|
||||||
|
*
|
||||||
|
* https://en.wikipedia.org/wiki/Linear_probing
|
||||||
|
* https://en.wikipedia.org/wiki/Open_addressing
|
||||||
|
* Algorithm without recomputing hash but keep probes difference value (difference of natural cell position and inserted one)
|
||||||
|
* in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
|
||||||
|
*
|
||||||
|
* Currently we use algorithm with hash recomputing on each step from https://en.wikipedia.org/wiki/Open_addressing
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (Cell::isZero(x, *this))
|
||||||
|
{
|
||||||
|
if (this->hasZero())
|
||||||
|
{
|
||||||
|
--m_size;
|
||||||
|
this->clearHasZero();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t hash_value = hash(x);
|
||||||
|
size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value));
|
||||||
|
|
||||||
|
/// Key is not found
|
||||||
|
if (buf[erased_key_position].isZero(*this))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// We need to guarantee loop termination because there will be empty position
|
||||||
|
assert(m_size < grower.bufSize());
|
||||||
|
|
||||||
|
size_t next_position = erased_key_position;
|
||||||
|
|
||||||
|
/// Walk to the right through collision resolution chain and move elements to better positions
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
next_position = grower.next(next_position);
|
||||||
|
|
||||||
|
/// If there's no more elements in the chain
|
||||||
|
if (buf[next_position].isZero(*this))
|
||||||
|
break;
|
||||||
|
|
||||||
|
/// The optimal position of the element in the cell at next_position
|
||||||
|
size_t optimal_position = grower.place(buf[next_position].getHash(*this));
|
||||||
|
|
||||||
|
/// If position of this element is already optimal - proceed to the next element.
|
||||||
|
if (optimal_position == next_position)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/// The case of non overlapping part of chain
|
||||||
|
if (next_position > erased_key_position
|
||||||
|
/// Cannot move this element because optimal position is after the freed place
|
||||||
|
/// The second condition is tricky - if the chain was overlapped before erased_key_position,
|
||||||
|
/// and the optimal position is actually before in collision resolution chain:
|
||||||
|
///
|
||||||
|
/// [*xn***----------------***]
|
||||||
|
/// ^^-next elem ^
|
||||||
|
/// | |
|
||||||
|
/// erased elem the optimal position of the next elem
|
||||||
|
///
|
||||||
|
/// so, the next elem should be moved to position of erased elem
|
||||||
|
&& (optimal_position > erased_key_position) && (optimal_position < next_position))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The case of overlapping chain
|
||||||
|
if (next_position < erased_key_position
|
||||||
|
/// Cannot move this element because optimal position is after the freed place
|
||||||
|
&& ((optimal_position > erased_key_position) || (optimal_position < next_position)))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Move the element to the freed place
|
||||||
|
memcpy(static_cast<void *>(&buf[erased_key_position]), static_cast<void *>(&buf[next_position]), sizeof(Cell));
|
||||||
|
/// Now we have another freed place
|
||||||
|
erased_key_position = next_position;
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[erased_key_position].setZero();
|
||||||
|
--m_size;
|
||||||
|
}
|
||||||
|
|
||||||
bool ALWAYS_INLINE has(const Key & x) const
|
bool ALWAYS_INLINE has(const Key & x) const
|
||||||
{
|
{
|
||||||
if (Cell::isZero(x, *this))
|
if (Cell::isZero(x, *this))
|
||||||
|
@ -353,6 +353,7 @@ private:
|
|||||||
void destroyLastElement()
|
void destroyLastElement()
|
||||||
{
|
{
|
||||||
auto last_element = counter_list.back();
|
auto last_element = counter_list.back();
|
||||||
|
counter_map.erase(last_element->key);
|
||||||
arena.free(last_element->key);
|
arena.free(last_element->key);
|
||||||
delete last_element;
|
delete last_element;
|
||||||
counter_list.pop_back();
|
counter_list.pop_back();
|
||||||
|
@ -10,9 +10,6 @@ target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io)
|
|||||||
add_executable (auto_array auto_array.cpp)
|
add_executable (auto_array auto_array.cpp)
|
||||||
target_link_libraries (auto_array PRIVATE clickhouse_common_io)
|
target_link_libraries (auto_array PRIVATE clickhouse_common_io)
|
||||||
|
|
||||||
add_executable (hash_table hash_table.cpp)
|
|
||||||
target_link_libraries (hash_table PRIVATE clickhouse_common_io)
|
|
||||||
|
|
||||||
add_executable (small_table small_table.cpp)
|
add_executable (small_table small_table.cpp)
|
||||||
target_link_libraries (small_table PRIVATE clickhouse_common_io)
|
target_link_libraries (small_table PRIVATE clickhouse_common_io)
|
||||||
|
|
||||||
|
210
src/Common/tests/gtest_hash_table.cpp
Normal file
210
src/Common/tests/gtest_hash_table.cpp
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
#include <iomanip>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include <Interpreters/AggregationCommon.h>
|
||||||
|
|
||||||
|
#include <Common/HashTable/HashMap.h>
|
||||||
|
#include <Common/HashTable/HashSet.h>
|
||||||
|
|
||||||
|
#include <IO/ReadBufferFromString.h>
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
/// To test dump functionality without using other hashes that can change
|
||||||
|
template <typename T>
|
||||||
|
struct DummyHash
|
||||||
|
{
|
||||||
|
size_t operator()(T key) const { return T(key); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename HashTable>
|
||||||
|
std::set<typename HashTable::value_type> convertToSet(const HashTable& table)
|
||||||
|
{
|
||||||
|
std::set<typename HashTable::value_type> result;
|
||||||
|
|
||||||
|
for (auto v: table)
|
||||||
|
result.emplace(v.getValue());
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
TEST(HashTable, Insert)
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
cont.insert(1);
|
||||||
|
cont.insert(2);
|
||||||
|
|
||||||
|
ASSERT_EQ(cont.size(), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(HashTable, Emplace)
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
Cont::LookupResult it;
|
||||||
|
bool inserted = false;
|
||||||
|
cont.emplace(1, it, inserted);
|
||||||
|
ASSERT_EQ(it->getKey(), 1);
|
||||||
|
ASSERT_EQ(inserted, true);
|
||||||
|
|
||||||
|
cont.emplace(2, it, inserted);
|
||||||
|
ASSERT_EQ(it->getKey(), 2);
|
||||||
|
ASSERT_EQ(inserted, true);
|
||||||
|
|
||||||
|
cont.emplace(1, it, inserted);
|
||||||
|
ASSERT_EQ(it->getKey(), 1);
|
||||||
|
ASSERT_EQ(inserted, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(HashTable, Lookup)
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
cont.insert(1);
|
||||||
|
cont.insert(2);
|
||||||
|
|
||||||
|
Cont::LookupResult it = cont.find(1);
|
||||||
|
ASSERT_TRUE(it != nullptr);
|
||||||
|
|
||||||
|
it = cont.find(2);
|
||||||
|
ASSERT_TRUE(it != nullptr);
|
||||||
|
|
||||||
|
it = cont.find(3);
|
||||||
|
ASSERT_TRUE(it == nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(HashTable, Iteration)
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
cont.insert(1);
|
||||||
|
cont.insert(2);
|
||||||
|
cont.insert(3);
|
||||||
|
|
||||||
|
std::set<int> expected = {1, 2, 3};
|
||||||
|
std::set<int> actual = convertToSet(cont);
|
||||||
|
|
||||||
|
ASSERT_EQ(actual, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(HashTable, Erase)
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 5000; ++i)
|
||||||
|
{
|
||||||
|
cont.insert(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 2500; ++i)
|
||||||
|
{
|
||||||
|
cont.erase(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 5000; i < 10000; ++i)
|
||||||
|
{
|
||||||
|
cont.insert(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 5000; i < 10000; ++i)
|
||||||
|
{
|
||||||
|
cont.erase(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 2500; i < 5000; ++i)
|
||||||
|
{
|
||||||
|
cont.erase(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_EQ(cont.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(HashTable, SerializationDeserialization)
|
||||||
|
{
|
||||||
|
{
|
||||||
|
/// Use dummy hash to make it reproducible if default hash implementation will be changed
|
||||||
|
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
cont.insert(1);
|
||||||
|
cont.insert(2);
|
||||||
|
cont.insert(3);
|
||||||
|
|
||||||
|
DB::WriteBufferFromOwnString wb;
|
||||||
|
cont.writeText(wb);
|
||||||
|
|
||||||
|
std::string expected = "3,1,2,3";
|
||||||
|
|
||||||
|
ASSERT_EQ(wb.str(), expected);
|
||||||
|
|
||||||
|
DB::ReadBufferFromString rb(expected);
|
||||||
|
|
||||||
|
Cont deserialized;
|
||||||
|
deserialized.readText(rb);
|
||||||
|
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
||||||
|
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
cont.insert(1);
|
||||||
|
cont.insert(2);
|
||||||
|
cont.insert(3);
|
||||||
|
|
||||||
|
DB::WriteBufferFromOwnString wb;
|
||||||
|
cont.write(wb);
|
||||||
|
|
||||||
|
DB::ReadBufferFromString rb(wb.str());
|
||||||
|
|
||||||
|
Cont deserialized;
|
||||||
|
deserialized.read(rb);
|
||||||
|
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
DB::WriteBufferFromOwnString wb;
|
||||||
|
cont.writeText(wb);
|
||||||
|
|
||||||
|
std::string expected = "0";
|
||||||
|
ASSERT_EQ(wb.str(), expected);
|
||||||
|
|
||||||
|
DB::ReadBufferFromString rb(expected);
|
||||||
|
|
||||||
|
Cont deserialized;
|
||||||
|
deserialized.readText(rb);
|
||||||
|
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
using Cont = HashSet<DB::UInt128, DB::UInt128TrivialHash>;
|
||||||
|
Cont cont;
|
||||||
|
|
||||||
|
DB::WriteBufferFromOwnString wb;
|
||||||
|
cont.write(wb);
|
||||||
|
|
||||||
|
std::string expected;
|
||||||
|
expected += static_cast<char>(0);
|
||||||
|
|
||||||
|
ASSERT_EQ(wb.str(), expected);
|
||||||
|
|
||||||
|
DB::ReadBufferFromString rb(expected);
|
||||||
|
|
||||||
|
Cont deserialized;
|
||||||
|
deserialized.read(rb);
|
||||||
|
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
|
||||||
|
}
|
||||||
|
}
|
@ -1,50 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <iomanip>
|
|
||||||
|
|
||||||
#include <Interpreters/AggregationCommon.h>
|
|
||||||
|
|
||||||
#include <Common/HashTable/HashMap.h>
|
|
||||||
#include <Common/HashTable/HashSet.h>
|
|
||||||
|
|
||||||
|
|
||||||
int main(int, char **)
|
|
||||||
{
|
|
||||||
{
|
|
||||||
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
|
|
||||||
Cont cont;
|
|
||||||
|
|
||||||
cont.insert(1);
|
|
||||||
cont.insert(2);
|
|
||||||
|
|
||||||
Cont::LookupResult it;
|
|
||||||
bool inserted;
|
|
||||||
int key = 3;
|
|
||||||
cont.emplace(key, it, inserted);
|
|
||||||
std::cerr << inserted << ", " << key << std::endl;
|
|
||||||
|
|
||||||
cont.emplace(key, it, inserted);
|
|
||||||
std::cerr << inserted << ", " << key << std::endl;
|
|
||||||
|
|
||||||
for (auto x : cont)
|
|
||||||
std::cerr << x.getValue() << std::endl;
|
|
||||||
|
|
||||||
DB::WriteBufferFromOwnString wb;
|
|
||||||
cont.writeText(wb);
|
|
||||||
|
|
||||||
std::cerr << "dump: " << wb.str() << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
using Cont = HashSet<
|
|
||||||
DB::UInt128,
|
|
||||||
DB::UInt128TrivialHash>;
|
|
||||||
Cont cont;
|
|
||||||
|
|
||||||
DB::WriteBufferFromOwnString wb;
|
|
||||||
cont.write(wb);
|
|
||||||
|
|
||||||
std::cerr << "dump: " << wb.str() << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1 +1,8 @@
|
|||||||
[0,1,2,3,4,5,6,7,8,9]
|
[0,1,2,3,4,5,6,7,8,9]
|
||||||
|
0 [[],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5']]]
|
||||||
|
1 [[[]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6']]]
|
||||||
|
2 [[[],[NULL]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6'],[NULL,'1','2','3','4','5','6','7']]]
|
||||||
|
3 [[[],[NULL],[NULL,'1']]]
|
||||||
|
4 [[[],[NULL],[NULL,'1'],[NULL,'1','2']]]
|
||||||
|
5 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3']]]
|
||||||
|
6 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4']]]
|
||||||
|
@ -1 +1,15 @@
|
|||||||
SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000);
|
SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000);
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
k,
|
||||||
|
topK(v)
|
||||||
|
FROM
|
||||||
|
(
|
||||||
|
SELECT
|
||||||
|
number % 7 AS k,
|
||||||
|
arrayMap(x -> arrayMap(x -> if(x = 0, NULL, toString(x)), range(x)), range(intDiv(number, 1))) AS v
|
||||||
|
FROM system.numbers
|
||||||
|
LIMIT 10
|
||||||
|
)
|
||||||
|
GROUP BY k
|
||||||
|
ORDER BY k ASC
|
||||||
|
Loading…
Reference in New Issue
Block a user