Merge pull request #17845 from kitaisreal/space-saving-remove-last-element-from-map-fix

SpaceSaving remove last element from map fix
alexey-milovidov 2020-12-13 04:09:51 +03:00 committed by GitHub
commit 8df4789113
7 changed files with 333 additions and 59 deletions

View File

@@ -194,9 +194,6 @@ struct HashTableCell
/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
static constexpr bool need_zero_value_storage = true;
/// Whether the cell is deleted.
bool isDeleted() const { return false; }
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
void setMapped(const value_type & /*value*/) {}
@@ -230,6 +227,9 @@ struct HashTableGrower
UInt8 size_degree = initial_size_degree;
static constexpr auto initial_count = 1ULL << initial_size_degree;
/// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
static constexpr auto performs_linear_probing_with_single_step = true;
/// The size of the hash table in the cells.
size_t bufSize() const { return 1ULL << size_degree; }
@@ -277,6 +277,9 @@ template <size_t key_bits>
struct HashTableFixedGrower
{
static constexpr auto initial_count = 1ULL << key_bits;
static constexpr auto performs_linear_probing_with_single_step = true;
size_t bufSize() const { return 1ULL << key_bits; }
size_t place(size_t x) const { return x; }
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
@@ -466,7 +469,7 @@ protected:
*/
size_t i = 0;
for (; i < old_size; ++i)
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
if (!buf[i].isZero(*this))
reinsert(buf[i], buf[i].getHash(*this));
/** There is also a special case:
@@ -477,7 +480,7 @@ protected:
* after transferring all the elements from the old halves you need to [ o x ]
* process tail from the collision resolution chain immediately after it [ o x ]
*/
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
for (; !buf[i].isZero(*this); ++i)
reinsert(buf[i], buf[i].getHash(*this));
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
@@ -829,6 +832,7 @@ protected:
*/
--m_size;
buf[place_value].setZero();
inserted = false;
throw;
}
@@ -954,6 +958,97 @@ public:
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
}
std::enable_if_t<Grower::performs_linear_probing_with_single_step, void>
ALWAYS_INLINE erase(const Key & x)
{
/** Deletion from an open addressing hash table without tombstones.
*
* https://en.wikipedia.org/wiki/Linear_probing
* https://en.wikipedia.org/wiki/Open_addressing
*
* There is an algorithm that avoids recomputing the hash by keeping the probe distance
* (the difference between an element's natural cell position and the cell it was actually inserted into)
* in the cell itself: https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
*
* Currently we use the algorithm that recomputes the hash on each step, from https://en.wikipedia.org/wiki/Open_addressing
*/
if (Cell::isZero(x, *this))
{
if (this->hasZero())
{
--m_size;
this->clearHasZero();
}
else
{
return;
}
}
size_t hash_value = hash(x);
size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value));
/// Key is not found
if (buf[erased_key_position].isZero(*this))
{
return;
}
/// We need to guarantee loop termination because there will be an empty position
assert(m_size < grower.bufSize());
size_t next_position = erased_key_position;
/// Walk to the right through collision resolution chain and move elements to better positions
while (true)
{
next_position = grower.next(next_position);
/// If there are no more elements in the chain
if (buf[next_position].isZero(*this))
break;
/// The optimal position of the element in the cell at next_position
size_t optimal_position = grower.place(buf[next_position].getHash(*this));
/// If position of this element is already optimal - proceed to the next element.
if (optimal_position == next_position)
continue;
/// The case of a non-overlapping part of the chain
if (next_position > erased_key_position
/// Cannot move this element because optimal position is after the freed place
/// The second condition is tricky - if the chain was overlapped before erased_key_position,
/// and the optimal position is actually before in collision resolution chain:
///
/// [*xn***----------------***]
///   ^^-next elem          ^
///   |                     |
///   erased elem           the optimal position of the next elem
///
/// so, the next elem should be moved to position of erased elem
&& (optimal_position > erased_key_position) && (optimal_position < next_position))
{
continue;
}
/// The case of overlapping chain
if (next_position < erased_key_position
/// Cannot move this element because optimal position is after the freed place
&& ((optimal_position > erased_key_position) || (optimal_position < next_position)))
{
continue;
}
/// Move the element to the freed place
memcpy(static_cast<void *>(&buf[erased_key_position]), static_cast<void *>(&buf[next_position]), sizeof(Cell));
/// Now we have another freed place
erased_key_position = next_position;
}
buf[erased_key_position].setZero();
--m_size;
}
bool ALWAYS_INLINE has(const Key & x) const
{
if (Cell::isZero(x, *this))
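
The comment block in the new erase() describes backward-shift deletion for linear probing with a single step: free the slot, then walk the collision resolution chain to the right and pull back any element that could no longer be reached from its home slot if the hole stayed empty. Below is a minimal standalone sketch of that idea on a toy fixed-size table with an identity-style hash; the names (Table, home, stays) are illustrative only, and this is not the ClickHouse implementation.

#include <array>
#include <cassert>
#include <cstddef>
#include <optional>

struct Table
{
    static constexpr size_t N = 16;                  /// power of two, like HashTableGrower's buffer
    std::array<std::optional<int>, N> buf{};         /// empty optional = empty cell (no tombstones)

    static size_t home(int key) { return static_cast<size_t>(key) & (N - 1); }   /// toy hash: identity masked to the table size
    static size_t next(size_t pos) { return (pos + 1) & (N - 1); }               /// linear probing with single step

    void insert(int key)
    {
        size_t pos = home(key);
        while (buf[pos])
            pos = next(pos);
        buf[pos] = key;
    }

    bool find(int key) const
    {
        for (size_t pos = home(key); buf[pos]; pos = next(pos))
            if (*buf[pos] == key)
                return true;
        return false;
    }

    void erase(int key)
    {
        size_t hole = home(key);
        while (buf[hole] && *buf[hole] != key)       /// assumes the table is not full, as the real code asserts
            hole = next(hole);
        if (!buf[hole])
            return;                                  /// key not found

        /// Walk to the right through the collision resolution chain and move elements to better positions.
        for (size_t i = next(hole); buf[i]; i = next(i))
        {
            size_t optimal = home(*buf[i]);
            /// Leave the element where it is if its home slot lies cyclically in (hole, i]:
            /// moving it into the hole would place it before its home slot and make it unreachable.
            bool stays = (hole < i) ? (optimal > hole && optimal <= i)
                                    : (optimal > hole || optimal <= i);   /// wrap-around (overlapping chain) case
            if (stays)
                continue;
            buf[hole] = buf[i];                      /// move the element to the freed place
            buf[i].reset();
            hole = i;                                /// now this is the freed place
        }
        buf[hole].reset();                           /// finally clear the hole
    }
};

int main()
{
    Table t;
    for (int key : {1, 17, 33, 2, 3})                /// 1, 17 and 33 all have home slot 1 and form one chain
        t.insert(key);

    t.erase(17);                                     /// frees a slot in the middle of the chain

    assert(!t.find(17));
    assert(t.find(1) && t.find(33) && t.find(2) && t.find(3));   /// the remaining keys are still reachable
    return 0;
}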

View File

@@ -353,6 +353,7 @@ private:
void destroyLastElement()
{
auto last_element = counter_list.back();
counter_map.erase(last_element->key);
arena.free(last_element->key);
delete last_element;
counter_list.pop_back();
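
The added counter_map.erase(last_element->key) removes the evicted counter's map entry while the key is still valid, before the key's arena storage and the counter itself are freed, so the map is not left holding an entry for a destroyed element. A minimal sketch of that eviction order, using stand-in types (Counter, TopK, a std::string instead of an arena-allocated key), not the actual SpaceSaving internals:

#include <cassert>
#include <list>
#include <string>
#include <string_view>
#include <unordered_map>

struct Counter
{
    std::string key;                 /// in SpaceSaving the key bytes live in an arena; a std::string stands in here
    unsigned long long count = 0;
};

struct TopK
{
    std::list<Counter *> counter_list;                            /// smallest counter kept at the back
    std::unordered_map<std::string_view, Counter *> counter_map;  /// views into the counters' key storage

    void destroyLastElement()
    {
        auto * last_element = counter_list.back();
        counter_map.erase(last_element->key);   /// erase while last_element->key is still alive (the added line)
        delete last_element;                    /// only now release the counter and its key storage
        counter_list.pop_back();
    }
};

int main()
{
    auto * a = new Counter{"a", 10};
    auto * b = new Counter{"b", 1};

    TopK top;
    top.counter_list = {a, b};
    top.counter_map = {{a->key, a}, {b->key, b}};

    top.destroyLastElement();                   /// evicts "b"

    assert(top.counter_map.count("a") == 1);
    assert(top.counter_map.count("b") == 0);    /// no stale entry pointing at freed memory
    delete a;
    return 0;
}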

View File

@@ -10,9 +10,6 @@ target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io)
add_executable (auto_array auto_array.cpp)
target_link_libraries (auto_array PRIVATE clickhouse_common_io)
add_executable (hash_table hash_table.cpp)
target_link_libraries (hash_table PRIVATE clickhouse_common_io)
add_executable (small_table small_table.cpp)
target_link_libraries (small_table PRIVATE clickhouse_common_io)

View File

@@ -0,0 +1,210 @@
#include <iomanip>
#include <iostream>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <IO/ReadBufferFromString.h>
#include <gtest/gtest.h>
/// To test dump functionality without using other hashes that can change
template <typename T>
struct DummyHash
{
size_t operator()(T key) const { return T(key); }
};
template<typename HashTable>
std::set<typename HashTable::value_type> convertToSet(const HashTable& table)
{
std::set<typename HashTable::value_type> result;
for (auto v: table)
result.emplace(v.getValue());
return result;
}
TEST(HashTable, Insert)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
ASSERT_EQ(cont.size(), 2);
}
TEST(HashTable, Emplace)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
Cont::LookupResult it;
bool inserted = false;
cont.emplace(1, it, inserted);
ASSERT_EQ(it->getKey(), 1);
ASSERT_EQ(inserted, true);
cont.emplace(2, it, inserted);
ASSERT_EQ(it->getKey(), 2);
ASSERT_EQ(inserted, true);
cont.emplace(1, it, inserted);
ASSERT_EQ(it->getKey(), 1);
ASSERT_EQ(inserted, false);
}
TEST(HashTable, Lookup)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
Cont::LookupResult it = cont.find(1);
ASSERT_TRUE(it != nullptr);
it = cont.find(2);
ASSERT_TRUE(it != nullptr);
it = cont.find(3);
ASSERT_TRUE(it == nullptr);
}
TEST(HashTable, Iteration)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
std::set<int> expected = {1, 2, 3};
std::set<int> actual = convertToSet(cont);
ASSERT_EQ(actual, expected);
}
TEST(HashTable, Erase)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
for (size_t i = 0; i < 5000; ++i)
{
cont.insert(i);
}
for (size_t i = 0; i < 2500; ++i)
{
cont.erase(i);
}
for (size_t i = 5000; i < 10000; ++i)
{
cont.insert(i);
}
for (size_t i = 5000; i < 10000; ++i)
{
cont.erase(i);
}
for (size_t i = 2500; i < 5000; ++i)
{
cont.erase(i);
}
ASSERT_EQ(cont.size(), 0);
}
TEST(HashTable, SerializationDeserialization)
{
{
/// Use dummy hash to make it reproducible if default hash implementation will be changed
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::string expected = "3,1,2,3";
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.readText(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
DB::WriteBufferFromOwnString wb;
cont.write(wb);
DB::ReadBufferFromString rb(wb.str());
Cont deserialized;
deserialized.read(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::string expected = "0";
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.readText(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<DB::UInt128, DB::UInt128TrivialHash>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.write(wb);
std::string expected;
expected += static_cast<char>(0);
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.read(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
}

View File

@ -1,50 +0,0 @@
#include <iostream>
#include <iomanip>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
int main(int, char **)
{
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
Cont::LookupResult it;
bool inserted;
int key = 3;
cont.emplace(key, it, inserted);
std::cerr << inserted << ", " << key << std::endl;
cont.emplace(key, it, inserted);
std::cerr << inserted << ", " << key << std::endl;
for (auto x : cont)
std::cerr << x.getValue() << std::endl;
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::cerr << "dump: " << wb.str() << std::endl;
}
{
using Cont = HashSet<
DB::UInt128,
DB::UInt128TrivialHash>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.write(wb);
std::cerr << "dump: " << wb.str() << std::endl;
}
return 0;
}

View File

@@ -1 +1,8 @@
[0,1,2,3,4,5,6,7,8,9]
0 [[],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5']]]
1 [[[]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6']]]
2 [[[],[NULL]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6'],[NULL,'1','2','3','4','5','6','7']]]
3 [[[],[NULL],[NULL,'1']]]
4 [[[],[NULL],[NULL,'1'],[NULL,'1','2']]]
5 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3']]]
6 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4']]]

View File

@@ -1 +1,15 @@
SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000);
SELECT
k,
topK(v)
FROM
(
SELECT
number % 7 AS k,
arrayMap(x -> arrayMap(x -> if(x = 0, NULL, toString(x)), range(x)), range(intDiv(number, 1))) AS v
FROM system.numbers
LIMIT 10
)
GROUP BY k
ORDER BY k ASC