diff --git a/src/Common/HashTable/HashTable.h b/src/Common/HashTable/HashTable.h index a569b1c15db..c3fed54206e 100644 --- a/src/Common/HashTable/HashTable.h +++ b/src/Common/HashTable/HashTable.h @@ -194,9 +194,6 @@ struct HashTableCell /// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table). static constexpr bool need_zero_value_storage = true; - /// Whether the cell is deleted. - bool isDeleted() const { return false; } - /// Set the mapped value, if any (for HashMap), to the corresponding `value`. void setMapped(const value_type & /*value*/) {} @@ -230,6 +227,9 @@ struct HashTableGrower UInt8 size_degree = initial_size_degree; static constexpr auto initial_count = 1ULL << initial_size_degree; + /// If collision resolution chains are contiguous, we can implement erase operation by moving the elements. + static constexpr auto performs_linear_probing_with_single_step = true; + /// The size of the hash table in the cells. size_t bufSize() const { return 1ULL << size_degree; } @@ -277,6 +277,9 @@ template struct HashTableFixedGrower { static constexpr auto initial_count = 1ULL << key_bits; + + static constexpr auto performs_linear_probing_with_single_step = true; + size_t bufSize() const { return 1ULL << key_bits; } size_t place(size_t x) const { return x; } /// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently. @@ -466,7 +469,7 @@ protected: */ size_t i = 0; for (; i < old_size; ++i) - if (!buf[i].isZero(*this) && !buf[i].isDeleted()) + if (!buf[i].isZero(*this)) reinsert(buf[i], buf[i].getHash(*this)); /** There is also a special case: @@ -477,7 +480,7 @@ protected: * after transferring all the elements from the old halves you need to [ o x ] * process tail from the collision resolution chain immediately after it [ o x ] */ - for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i) + for (; !buf[i].isZero(*this); ++i) reinsert(buf[i], buf[i].getHash(*this)); #ifdef DBMS_HASH_MAP_DEBUG_RESIZES @@ -829,6 +832,7 @@ protected: */ --m_size; buf[place_value].setZero(); + inserted = false; throw; } @@ -954,6 +958,97 @@ public: return const_cast *>(this)->find(x, hash_value); } + std::enable_if_t + ALWAYS_INLINE erase(const Key & x) + { + /** Deletion from open addressing hash table without tombstones + * + * https://en.wikipedia.org/wiki/Linear_probing + * https://en.wikipedia.org/wiki/Open_addressing + * Algorithm without recomputing hash but keep probes difference value (difference of natural cell position and inserted one) + * in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf + * + * Currently we use algorithm with hash recomputing on each step from https://en.wikipedia.org/wiki/Open_addressing + */ + + if (Cell::isZero(x, *this)) + { + if (this->hasZero()) + { + --m_size; + this->clearHasZero(); + } + else + { + return; + } + } + + size_t hash_value = hash(x); + size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value)); + + /// Key is not found + if (buf[erased_key_position].isZero(*this)) + { + return; + } + + /// We need to guarantee loop termination because there will be empty position + assert(m_size < grower.bufSize()); + + size_t next_position = erased_key_position; + + /// Walk to the right through collision resolution chain and move elements to better positions + while (true) + { + next_position = grower.next(next_position); + + /// If there's no more elements in the chain + if (buf[next_position].isZero(*this)) + break; + + /// The optimal position of the element in the cell at next_position + size_t optimal_position = grower.place(buf[next_position].getHash(*this)); + + /// If position of this element is already optimal - proceed to the next element. + if (optimal_position == next_position) + continue; + + /// The case of non overlapping part of chain + if (next_position > erased_key_position + /// Cannot move this element because optimal position is after the freed place + /// The second condition is tricky - if the chain was overlapped before erased_key_position, + /// and the optimal position is actually before in collision resolution chain: + /// + /// [*xn***----------------***] + /// ^^-next elem ^ + /// | | + /// erased elem the optimal position of the next elem + /// + /// so, the next elem should be moved to position of erased elem + && (optimal_position > erased_key_position) && (optimal_position < next_position)) + { + continue; + } + + /// The case of overlapping chain + if (next_position < erased_key_position + /// Cannot move this element because optimal position is after the freed place + && ((optimal_position > erased_key_position) || (optimal_position < next_position))) + { + continue; + } + + /// Move the element to the freed place + memcpy(static_cast(&buf[erased_key_position]), static_cast(&buf[next_position]), sizeof(Cell)); + /// Now we have another freed place + erased_key_position = next_position; + } + + buf[erased_key_position].setZero(); + --m_size; + } + bool ALWAYS_INLINE has(const Key & x) const { if (Cell::isZero(x, *this)) diff --git a/src/Common/SpaceSaving.h b/src/Common/SpaceSaving.h index cb6fee1ad91..185b4aa90ae 100644 --- a/src/Common/SpaceSaving.h +++ b/src/Common/SpaceSaving.h @@ -353,6 +353,7 @@ private: void destroyLastElement() { auto last_element = counter_list.back(); + counter_map.erase(last_element->key); arena.free(last_element->key); delete last_element; counter_list.pop_back(); diff --git a/src/Common/tests/CMakeLists.txt b/src/Common/tests/CMakeLists.txt index 6a39c2f8553..cb36e2b97d2 100644 --- a/src/Common/tests/CMakeLists.txt +++ b/src/Common/tests/CMakeLists.txt @@ -10,9 +10,6 @@ target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io) add_executable (auto_array auto_array.cpp) target_link_libraries (auto_array PRIVATE clickhouse_common_io) -add_executable (hash_table hash_table.cpp) -target_link_libraries (hash_table PRIVATE clickhouse_common_io) - add_executable (small_table small_table.cpp) target_link_libraries (small_table PRIVATE clickhouse_common_io) diff --git a/src/Common/tests/gtest_hash_table.cpp b/src/Common/tests/gtest_hash_table.cpp new file mode 100644 index 00000000000..bd789f27709 --- /dev/null +++ b/src/Common/tests/gtest_hash_table.cpp @@ -0,0 +1,210 @@ +#include +#include + +#include + +#include +#include + +#include + +#include + +/// To test dump functionality without using other hashes that can change +template +struct DummyHash +{ + size_t operator()(T key) const { return T(key); } +}; + +template +std::set convertToSet(const HashTable& table) +{ + std::set result; + + for (auto v: table) + result.emplace(v.getValue()); + + return result; +} + + +TEST(HashTable, Insert) +{ + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + cont.insert(1); + cont.insert(2); + + ASSERT_EQ(cont.size(), 2); +} + +TEST(HashTable, Emplace) +{ + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + Cont::LookupResult it; + bool inserted = false; + cont.emplace(1, it, inserted); + ASSERT_EQ(it->getKey(), 1); + ASSERT_EQ(inserted, true); + + cont.emplace(2, it, inserted); + ASSERT_EQ(it->getKey(), 2); + ASSERT_EQ(inserted, true); + + cont.emplace(1, it, inserted); + ASSERT_EQ(it->getKey(), 1); + ASSERT_EQ(inserted, false); +} + +TEST(HashTable, Lookup) +{ + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + cont.insert(1); + cont.insert(2); + + Cont::LookupResult it = cont.find(1); + ASSERT_TRUE(it != nullptr); + + it = cont.find(2); + ASSERT_TRUE(it != nullptr); + + it = cont.find(3); + ASSERT_TRUE(it == nullptr); +} + +TEST(HashTable, Iteration) +{ + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + cont.insert(1); + cont.insert(2); + cont.insert(3); + + std::set expected = {1, 2, 3}; + std::set actual = convertToSet(cont); + + ASSERT_EQ(actual, expected); +} + +TEST(HashTable, Erase) +{ + using Cont = HashSet, HashTableGrower<1>>; + Cont cont; + + for (size_t i = 0; i < 5000; ++i) + { + cont.insert(i); + } + + for (size_t i = 0; i < 2500; ++i) + { + cont.erase(i); + } + + for (size_t i = 5000; i < 10000; ++i) + { + cont.insert(i); + } + + for (size_t i = 5000; i < 10000; ++i) + { + cont.erase(i); + } + + for (size_t i = 2500; i < 5000; ++i) + { + cont.erase(i); + } + + ASSERT_EQ(cont.size(), 0); +} + +TEST(HashTable, SerializationDeserialization) +{ + { + /// Use dummy hash to make it reproducible if default hash implementation will be changed + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + cont.insert(1); + cont.insert(2); + cont.insert(3); + + DB::WriteBufferFromOwnString wb; + cont.writeText(wb); + + std::string expected = "3,1,2,3"; + + ASSERT_EQ(wb.str(), expected); + + DB::ReadBufferFromString rb(expected); + + Cont deserialized; + deserialized.readText(rb); + ASSERT_EQ(convertToSet(cont), convertToSet(deserialized)); + } + { + using Cont = HashSet, HashTableGrower<1>>; + + Cont cont; + + cont.insert(1); + cont.insert(2); + cont.insert(3); + + DB::WriteBufferFromOwnString wb; + cont.write(wb); + + DB::ReadBufferFromString rb(wb.str()); + + Cont deserialized; + deserialized.read(rb); + ASSERT_EQ(convertToSet(cont), convertToSet(deserialized)); + } + { + using Cont = HashSet, HashTableGrower<1>>; + Cont cont; + + DB::WriteBufferFromOwnString wb; + cont.writeText(wb); + + std::string expected = "0"; + ASSERT_EQ(wb.str(), expected); + + DB::ReadBufferFromString rb(expected); + + Cont deserialized; + deserialized.readText(rb); + ASSERT_EQ(convertToSet(cont), convertToSet(deserialized)); + } + { + using Cont = HashSet; + Cont cont; + + DB::WriteBufferFromOwnString wb; + cont.write(wb); + + std::string expected; + expected += static_cast(0); + + ASSERT_EQ(wb.str(), expected); + + DB::ReadBufferFromString rb(expected); + + Cont deserialized; + deserialized.read(rb); + ASSERT_EQ(convertToSet(cont), convertToSet(deserialized)); + } +} diff --git a/src/Common/tests/hash_table.cpp b/src/Common/tests/hash_table.cpp deleted file mode 100644 index ebc22c5b5e5..00000000000 --- a/src/Common/tests/hash_table.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include - -#include - -#include -#include - - -int main(int, char **) -{ - { - using Cont = HashSet, HashTableGrower<1>>; - Cont cont; - - cont.insert(1); - cont.insert(2); - - Cont::LookupResult it; - bool inserted; - int key = 3; - cont.emplace(key, it, inserted); - std::cerr << inserted << ", " << key << std::endl; - - cont.emplace(key, it, inserted); - std::cerr << inserted << ", " << key << std::endl; - - for (auto x : cont) - std::cerr << x.getValue() << std::endl; - - DB::WriteBufferFromOwnString wb; - cont.writeText(wb); - - std::cerr << "dump: " << wb.str() << std::endl; - } - - { - using Cont = HashSet< - DB::UInt128, - DB::UInt128TrivialHash>; - Cont cont; - - DB::WriteBufferFromOwnString wb; - cont.write(wb); - - std::cerr << "dump: " << wb.str() << std::endl; - } - - return 0; -} diff --git a/tests/queries/0_stateless/00453_top_k.reference b/tests/queries/0_stateless/00453_top_k.reference index 1a768b03965..14beb3273fa 100644 --- a/tests/queries/0_stateless/00453_top_k.reference +++ b/tests/queries/0_stateless/00453_top_k.reference @@ -1 +1,8 @@ [0,1,2,3,4,5,6,7,8,9] +0 [[],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5']]] +1 [[[]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6']]] +2 [[[],[NULL]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6'],[NULL,'1','2','3','4','5','6','7']]] +3 [[[],[NULL],[NULL,'1']]] +4 [[[],[NULL],[NULL,'1'],[NULL,'1','2']]] +5 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3']]] +6 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4']]] diff --git a/tests/queries/0_stateless/00453_top_k.sql b/tests/queries/0_stateless/00453_top_k.sql index 1f79a8c5393..fb3b88e29e4 100644 --- a/tests/queries/0_stateless/00453_top_k.sql +++ b/tests/queries/0_stateless/00453_top_k.sql @@ -1 +1,15 @@ -SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000); \ No newline at end of file +SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000); + +SELECT + k, + topK(v) +FROM +( + SELECT + number % 7 AS k, + arrayMap(x -> arrayMap(x -> if(x = 0, NULL, toString(x)), range(x)), range(intDiv(number, 1))) AS v + FROM system.numbers + LIMIT 10 +) +GROUP BY k +ORDER BY k ASC