diff --git a/src/Common/HashTable/HashTable.h b/src/Common/HashTable/HashTable.h index c3fed54206e..2d5580bf709 100644 --- a/src/Common/HashTable/HashTable.h +++ b/src/Common/HashTable/HashTable.h @@ -966,7 +966,7 @@ public: * https://en.wikipedia.org/wiki/Linear_probing * https://en.wikipedia.org/wiki/Open_addressing * Algorithm without recomputing hash but keep probes difference value (difference of natural cell position and inserted one) - * in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf + * in cell https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf * * Currently we use algorithm with hash recomputing on each step from https://en.wikipedia.org/wiki/Open_addressing */ @@ -998,6 +998,20 @@ public: size_t next_position = erased_key_position; + /** + * During element deletion there is a possibility that the search will be broken for one + * of the following elements, because this place erased_key_position is empty. We will check + * next_element. Consider a sequence from (erased_key_position, next_element], if the + * optimal_position of next_element falls into it, then removing erased_key_position + * will not break search for next_element. + * If optimal_position of the element does not fall into the sequence (erased_key_position, next_element] + * then deleting a erased_key_position will break search for it, so we need to move next_element + * to erased_key_position. Now we have empty place at next_element, so we apply the identical + * procedure for it. + * If an empty element is encoutered then means that there is no more next elements for which we can + * break the search so we can exit. + */ + /// Walk to the right through collision resolution chain and move elements to better positions while (true) { @@ -1014,19 +1028,20 @@ public: if (optimal_position == next_position) continue; + /// Cannot move this element because optimal position is after the freed place + /// The second condition is tricky - if the chain was overlapped before erased_key_position, + /// and the optimal position is actually before in collision resolution chain: + /// + /// [*xn***----------------***] + /// ^^-next elem ^ + /// | | + /// erased elem the optimal position of the next elem + /// + /// so, the next elem should be moved to position of erased elem + /// The case of non overlapping part of chain if (next_position > erased_key_position - /// Cannot move this element because optimal position is after the freed place - /// The second condition is tricky - if the chain was overlapped before erased_key_position, - /// and the optimal position is actually before in collision resolution chain: - /// - /// [*xn***----------------***] - /// ^^-next elem ^ - /// | | - /// erased elem the optimal position of the next elem - /// - /// so, the next elem should be moved to position of erased elem - && (optimal_position > erased_key_position) && (optimal_position < next_position)) + && (optimal_position > erased_key_position) && (optimal_position < next_position)) { continue; } diff --git a/src/Common/tests/gtest_hash_table.cpp b/src/Common/tests/gtest_hash_table.cpp index bd789f27709..41255dcbba1 100644 --- a/src/Common/tests/gtest_hash_table.cpp +++ b/src/Common/tests/gtest_hash_table.cpp @@ -99,35 +99,144 @@ TEST(HashTable, Iteration) TEST(HashTable, Erase) { - using Cont = HashSet, HashTableGrower<1>>; - Cont cont; - - for (size_t i = 0; i < 5000; ++i) { - cont.insert(i); - } + /// Check zero element deletion + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; - for (size_t i = 0; i < 2500; ++i) + cont.insert(0); + + ASSERT_TRUE(cont.find(0) != nullptr && cont.find(0)->getKey() == 0); + + cont.erase(0); + + ASSERT_TRUE(cont.find(0) == nullptr); + } { - cont.erase(i); - } + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; - for (size_t i = 5000; i < 10000; ++i) + /// [.(1)..............] erase of (1). + cont.insert(1); + + ASSERT_TRUE(cont.find(1) != nullptr && cont.find(1)->getKey() == 1); + + cont.erase(1); + + ASSERT_TRUE(cont.find(1) == nullptr); + } { - cont.insert(i); - } + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; - for (size_t i = 5000; i < 10000; ++i) + /// [.(1)(2)(3)............] erase of (1) does not break search for (2) (3). + cont.insert(1); + cont.insert(2); + cont.insert(3); + cont.erase(1); + + ASSERT_TRUE(cont.find(1) == nullptr); + ASSERT_TRUE(cont.find(2) != nullptr && cont.find(2)->getKey() == 2); + ASSERT_TRUE(cont.find(3) != nullptr && cont.find(3)->getKey() == 3); + + cont.erase(2); + cont.erase(3); + ASSERT_TRUE(cont.find(2) == nullptr); + ASSERT_TRUE(cont.find(3) == nullptr); + ASSERT_EQ(cont.size(), 0); + } { - cont.erase(i); - } + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; - for (size_t i = 2500; i < 5000; ++i) + /// [.(1)(17).............] erase of (1) breaks search for (17) because their natural position is 1. + cont.insert(1); + cont.insert(17); + cont.erase(1); + + ASSERT_TRUE(cont.find(1) == nullptr); + ASSERT_TRUE(cont.find(17) != nullptr && cont.find(17)->getKey() == 17); + } { - cont.erase(i); - } + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; - ASSERT_EQ(cont.size(), 0); + /// [.(1)(2)(3)(17)...........] erase of (2) breaks search for (17) because their natural position is 1. + + cont.insert(1); + cont.insert(2); + cont.insert(3); + cont.insert(17); + cont.erase(2); + + ASSERT_TRUE(cont.find(2) == nullptr); + ASSERT_TRUE(cont.find(1) != nullptr && cont.find(1)->getKey() == 1); + ASSERT_TRUE(cont.find(3) != nullptr && cont.find(3)->getKey() == 3); + ASSERT_TRUE(cont.find(17) != nullptr && cont.find(17)->getKey() == 17); + } + { + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; + + /// [(16)(30)............(14)(15)] erase of (16) breaks search for (30) because their natural position is 14. + cont.insert(14); + cont.insert(15); + cont.insert(16); + cont.insert(30); + cont.erase(16); + + ASSERT_TRUE(cont.find(16) == nullptr); + ASSERT_TRUE(cont.find(14) != nullptr && cont.find(14)->getKey() == 14); + ASSERT_TRUE(cont.find(15) != nullptr && cont.find(15)->getKey() == 15); + ASSERT_TRUE(cont.find(30) != nullptr && cont.find(30)->getKey() == 30); + } + { + using Cont = HashSet, HashTableGrower<4>>; + Cont cont; + + /// [(16)(30)............(14)(15)] erase of (15) breaks search for (30) because their natural position is 14. + cont.insert(14); + cont.insert(15); + cont.insert(16); + cont.insert(30); + cont.erase(15); + + ASSERT_TRUE(cont.find(15) == nullptr); + ASSERT_TRUE(cont.find(14) != nullptr && cont.find(14)->getKey() == 14); + ASSERT_TRUE(cont.find(16) != nullptr && cont.find(16)->getKey() == 16); + ASSERT_TRUE(cont.find(30) != nullptr && cont.find(30)->getKey() == 30); + } + { + using Cont = HashSet, HashTableGrower<1>>; + Cont cont; + + for (size_t i = 0; i < 5000; ++i) + { + cont.insert(i); + } + + for (size_t i = 0; i < 2500; ++i) + { + cont.erase(i); + } + + for (size_t i = 5000; i < 10000; ++i) + { + cont.insert(i); + } + + for (size_t i = 5000; i < 10000; ++i) + { + cont.erase(i); + } + + for (size_t i = 2500; i < 5000; ++i) + { + cont.erase(i); + } + + ASSERT_EQ(cont.size(), 0); + } } TEST(HashTable, SerializationDeserialization)