Merge pull request #17845 from kitaisreal/space-saving-remove-last-element-from-map-fix

SpaceSaving remove last element from map fix
alexey-milovidov 2020-12-13 04:09:51 +03:00 committed by GitHub
commit 8df4789113
7 changed files with 333 additions and 59 deletions

View File

@@ -194,9 +194,6 @@ struct HashTableCell
/// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table).
static constexpr bool need_zero_value_storage = true;
/// Whether the cell is deleted.
bool isDeleted() const { return false; }
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
void setMapped(const value_type & /*value*/) {}
@@ -230,6 +227,9 @@ struct HashTableGrower
UInt8 size_degree = initial_size_degree;
static constexpr auto initial_count = 1ULL << initial_size_degree;
/// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
static constexpr auto performs_linear_probing_with_single_step = true;
/// The size of the hash table in the cells.
size_t bufSize() const { return 1ULL << size_degree; }
@@ -277,6 +277,9 @@ template <size_t key_bits>
struct HashTableFixedGrower
{
static constexpr auto initial_count = 1ULL << key_bits;
static constexpr auto performs_linear_probing_with_single_step = true;
size_t bufSize() const { return 1ULL << key_bits; }
size_t place(size_t x) const { return x; }
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
@@ -466,7 +469,7 @@ protected:
*/
size_t i = 0;
for (; i < old_size; ++i)
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
if (!buf[i].isZero(*this))
reinsert(buf[i], buf[i].getHash(*this));
/** There is also a special case:
@@ -477,7 +480,7 @@ protected:
* after transferring all the elements from the old halves you need to [ o x ]
* process tail from the collision resolution chain immediately after it [ o x ]
*/
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
for (; !buf[i].isZero(*this); ++i)
reinsert(buf[i], buf[i].getHash(*this));
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
@@ -829,6 +832,7 @@ protected:
*/
--m_size;
buf[place_value].setZero();
inserted = false;
throw;
}
@@ -954,6 +958,97 @@ public:
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
}
std::enable_if_t<Grower::performs_linear_probing_with_single_step, void>
ALWAYS_INLINE erase(const Key & x)
{
/** Deletion from an open addressing hash table without tombstones.
*
* https://en.wikipedia.org/wiki/Linear_probing
* https://en.wikipedia.org/wiki/Open_addressing
*
* There is an algorithm that avoids recomputing the hash by keeping the probe distance
* (the difference between an element's natural cell position and the cell it was actually inserted into)
* in the cell itself: https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
*
* Currently we use the algorithm that recomputes the hash on each step, from https://en.wikipedia.org/wiki/Open_addressing
*/
if (Cell::isZero(x, *this))
{
if (this->hasZero())
{
--m_size;
this->clearHasZero();
}
else
{
return;
}
}
size_t hash_value = hash(x);
size_t erased_key_position = findCell(x, hash_value, grower.place(hash_value));
/// Key is not found
if (buf[erased_key_position].isZero(*this))
{
return;
}
/// We need to guarantee loop termination because there will be an empty position
assert(m_size < grower.bufSize());
size_t next_position = erased_key_position;
/// Walk to the right through collision resolution chain and move elements to better positions
while (true)
{
next_position = grower.next(next_position);
/// If there are no more elements in the chain
if (buf[next_position].isZero(*this))
break;
/// The optimal position of the element in the cell at next_position
size_t optimal_position = grower.place(buf[next_position].getHash(*this));
/// If position of this element is already optimal - proceed to the next element.
if (optimal_position == next_position)
continue;
/// The case of a non-overlapping part of the chain
if (next_position > erased_key_position
/// Cannot move this element because optimal position is after the freed place
/// The second condition is tricky - if the chain was overlapped before erased_key_position,
/// and the optimal position is actually before in collision resolution chain:
///
/// [*xn***----------------***]
///   ^^-next elem          ^
///   |                     |
///   erased elem           the optimal position of the next elem
///
/// so, the next elem should be moved to position of erased elem
&& (optimal_position > erased_key_position) && (optimal_position < next_position))
{
continue;
}
/// The case of overlapping chain
if (next_position < erased_key_position
/// Cannot move this element because optimal position is after the freed place
&& ((optimal_position > erased_key_position) || (optimal_position < next_position)))
{
continue;
}
/// Move the element to the freed place
memcpy(static_cast<void *>(&buf[erased_key_position]), static_cast<void *>(&buf[next_position]), sizeof(Cell));
/// Now we have another freed place
erased_key_position = next_position;
}
buf[erased_key_position].setZero();
--m_size;
}
bool ALWAYS_INLINE has(const Key & x) const
{
if (Cell::isZero(x, *this))
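
The comment block in the new erase() describes backward-shift deletion for linear probing with a single step: free the slot, then walk the collision resolution chain to the right and pull back any element that could no longer be reached from its home slot if the hole stayed empty. Below is a minimal standalone sketch of that idea on a toy fixed-size table with an identity-style hash; the names (Table, home, stays) are illustrative only, and this is not the ClickHouse implementation.

#include <array>
#include <cassert>
#include <cstddef>
#include <optional>

struct Table
{
    static constexpr size_t N = 16;                  /// power of two, like HashTableGrower's buffer
    std::array<std::optional<int>, N> buf{};         /// empty optional = empty cell (no tombstones)

    static size_t home(int key) { return static_cast<size_t>(key) & (N - 1); }   /// toy hash: identity masked to the table size
    static size_t next(size_t pos) { return (pos + 1) & (N - 1); }               /// linear probing with single step

    void insert(int key)
    {
        size_t pos = home(key);
        while (buf[pos])
            pos = next(pos);
        buf[pos] = key;
    }

    bool find(int key) const
    {
        for (size_t pos = home(key); buf[pos]; pos = next(pos))
            if (*buf[pos] == key)
                return true;
        return false;
    }

    void erase(int key)
    {
        size_t hole = home(key);
        while (buf[hole] && *buf[hole] != key)       /// assumes the table is not full, as the real code asserts
            hole = next(hole);
        if (!buf[hole])
            return;                                  /// key not found

        /// Walk to the right through the collision resolution chain and move elements to better positions.
        for (size_t i = next(hole); buf[i]; i = next(i))
        {
            size_t optimal = home(*buf[i]);
            /// Leave the element where it is if its home slot lies cyclically in (hole, i]:
            /// moving it into the hole would place it before its home slot and make it unreachable.
            bool stays = (hole < i) ? (optimal > hole && optimal <= i)
                                    : (optimal > hole || optimal <= i);   /// wrap-around (overlapping chain) case
            if (stays)
                continue;
            buf[hole] = buf[i];                      /// move the element to the freed place
            buf[i].reset();
            hole = i;                                /// now this is the freed place
        }
        buf[hole].reset();                           /// finally clear the hole
    }
};

int main()
{
    Table t;
    for (int key : {1, 17, 33, 2, 3})                /// 1, 17 and 33 all have home slot 1 and form one chain
        t.insert(key);

    t.erase(17);                                     /// frees a slot in the middle of the chain

    assert(!t.find(17));
    assert(t.find(1) && t.find(33) && t.find(2) && t.find(3));   /// the remaining keys are still reachable
    return 0;
}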

View File

@@ -353,6 +353,7 @@ private:
void destroyLastElement()
{
auto last_element = counter_list.back();
counter_map.erase(last_element->key);
arena.free(last_element->key);
delete last_element;
counter_list.pop_back();
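
The added counter_map.erase(last_element->key) removes the evicted counter's map entry while the key is still valid, before the key's arena storage and the counter itself are freed, so the map is not left holding an entry for a destroyed element. A minimal sketch of that eviction order, using stand-in types (Counter, TopK, a std::string instead of an arena-allocated key), not the actual SpaceSaving internals:

#include <cassert>
#include <list>
#include <string>
#include <string_view>
#include <unordered_map>

struct Counter
{
    std::string key;                 /// in SpaceSaving the key bytes live in an arena; a std::string stands in here
    unsigned long long count = 0;
};

struct TopK
{
    std::list<Counter *> counter_list;                            /// smallest counter kept at the back
    std::unordered_map<std::string_view, Counter *> counter_map;  /// views into the counters' key storage

    void destroyLastElement()
    {
        auto * last_element = counter_list.back();
        counter_map.erase(last_element->key);   /// erase while last_element->key is still alive (the added line)
        delete last_element;                    /// only now release the counter and its key storage
        counter_list.pop_back();
    }
};

int main()
{
    auto * a = new Counter{"a", 10};
    auto * b = new Counter{"b", 1};

    TopK top;
    top.counter_list = {a, b};
    top.counter_map = {{a->key, a}, {b->key, b}};

    top.destroyLastElement();                   /// evicts "b"

    assert(top.counter_map.count("a") == 1);
    assert(top.counter_map.count("b") == 0);    /// no stale entry pointing at freed memory
    delete a;
    return 0;
}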

View File

@@ -10,9 +10,6 @@ target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io)
add_executable (auto_array auto_array.cpp)
target_link_libraries (auto_array PRIVATE clickhouse_common_io)
add_executable (hash_table hash_table.cpp)
target_link_libraries (hash_table PRIVATE clickhouse_common_io)
add_executable (small_table small_table.cpp)
target_link_libraries (small_table PRIVATE clickhouse_common_io)

View File

@@ -0,0 +1,210 @@
#include <iomanip>
#include <iostream>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <IO/ReadBufferFromString.h>
#include <gtest/gtest.h>
/// To test dump functionality without using other hashes that can change
template <typename T>
struct DummyHash
{
size_t operator()(T key) const { return T(key); }
};
template<typename HashTable>
std::set<typename HashTable::value_type> convertToSet(const HashTable& table)
{
std::set<typename HashTable::value_type> result;
for (auto v: table)
result.emplace(v.getValue());
return result;
}
TEST(HashTable, Insert)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
ASSERT_EQ(cont.size(), 2);
}
TEST(HashTable, Emplace)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
Cont::LookupResult it;
bool inserted = false;
cont.emplace(1, it, inserted);
ASSERT_EQ(it->getKey(), 1);
ASSERT_EQ(inserted, true);
cont.emplace(2, it, inserted);
ASSERT_EQ(it->getKey(), 2);
ASSERT_EQ(inserted, true);
cont.emplace(1, it, inserted);
ASSERT_EQ(it->getKey(), 1);
ASSERT_EQ(inserted, false);
}
TEST(HashTable, Lookup)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
Cont::LookupResult it = cont.find(1);
ASSERT_TRUE(it != nullptr);
it = cont.find(2);
ASSERT_TRUE(it != nullptr);
it = cont.find(3);
ASSERT_TRUE(it == nullptr);
}
TEST(HashTable, Iteration)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
std::set<int> expected = {1, 2, 3};
std::set<int> actual = convertToSet(cont);
ASSERT_EQ(actual, expected);
}
TEST(HashTable, Erase)
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
for (size_t i = 0; i < 5000; ++i)
{
cont.insert(i);
}
for (size_t i = 0; i < 2500; ++i)
{
cont.erase(i);
}
for (size_t i = 5000; i < 10000; ++i)
{
cont.insert(i);
}
for (size_t i = 5000; i < 10000; ++i)
{
cont.erase(i);
}
for (size_t i = 2500; i < 5000; ++i)
{
cont.erase(i);
}
ASSERT_EQ(cont.size(), 0);
}
TEST(HashTable, SerializationDeserialization)
{
{
/// Use dummy hash to make it reproducible if default hash implementation will be changed
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::string expected = "3,1,2,3";
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.readText(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
cont.insert(3);
DB::WriteBufferFromOwnString wb;
cont.write(wb);
DB::ReadBufferFromString rb(wb.str());
Cont deserialized;
deserialized.read(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<int, DummyHash<int>, HashTableGrower<1>>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::string expected = "0";
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.readText(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
{
using Cont = HashSet<DB::UInt128, DB::UInt128TrivialHash>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.write(wb);
std::string expected;
expected += static_cast<char>(0);
ASSERT_EQ(wb.str(), expected);
DB::ReadBufferFromString rb(expected);
Cont deserialized;
deserialized.read(rb);
ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
}
}

View File

@ -1,50 +0,0 @@
#include <iostream>
#include <iomanip>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
int main(int, char **)
{
{
using Cont = HashSet<int, DefaultHash<int>, HashTableGrower<1>>;
Cont cont;
cont.insert(1);
cont.insert(2);
Cont::LookupResult it;
bool inserted;
int key = 3;
cont.emplace(key, it, inserted);
std::cerr << inserted << ", " << key << std::endl;
cont.emplace(key, it, inserted);
std::cerr << inserted << ", " << key << std::endl;
for (auto x : cont)
std::cerr << x.getValue() << std::endl;
DB::WriteBufferFromOwnString wb;
cont.writeText(wb);
std::cerr << "dump: " << wb.str() << std::endl;
}
{
using Cont = HashSet<
DB::UInt128,
DB::UInt128TrivialHash>;
Cont cont;
DB::WriteBufferFromOwnString wb;
cont.write(wb);
std::cerr << "dump: " << wb.str() << std::endl;
}
return 0;
}

View File

@@ -1 +1,8 @@
[0,1,2,3,4,5,6,7,8,9]
0 [[],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5']]]
1 [[[]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6']]]
2 [[[],[NULL]],[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4'],[NULL,'1','2','3','4','5'],[NULL,'1','2','3','4','5','6'],[NULL,'1','2','3','4','5','6','7']]]
3 [[[],[NULL],[NULL,'1']]]
4 [[[],[NULL],[NULL,'1'],[NULL,'1','2']]]
5 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3']]]
6 [[[],[NULL],[NULL,'1'],[NULL,'1','2'],[NULL,'1','2','3'],[NULL,'1','2','3','4']]]

View File

@@ -1 +1,15 @@
SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000);
SELECT
k,
topK(v)
FROM
(
SELECT
number % 7 AS k,
arrayMap(x -> arrayMap(x -> if(x = 0, NULL, toString(x)), range(x)), range(intDiv(number, 1))) AS v
FROM system.numbers
LIMIT 10
)
GROUP BY k
ORDER BY k ASC