// ClickHouse/src/Common/tests/gtest_hash_table.cpp
//
// Exported from a git blame view (430 lines, 11 KiB, C++; revisions listed in
// .git-blame-ignore-revs are ignored). Bare timestamp lines in this text are
// blame annotations from that view, not part of the source.
#include <iomanip>
#include <iostream>
#include <numeric>
#include <set>
#include <string>
#include <vector>

#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/Hash.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteHelpers.h>

#include <gtest/gtest.h>

using namespace DB;
namespace
{
std::vector<UInt64> getVectorWithNumbersUpToN(size_t n)
{
std::vector<UInt64> res(n);
std::iota(res.begin(), res.end(), 0);
return res;
}
}
2021-04-25 10:02:19 +00:00
2020-12-12 12:08:46 +00:00
/// Hash functor that returns the key itself. Used to test dump functionality
/// without relying on other hashes whose implementation can change.
template <typename Key>
struct DummyHash
{
    size_t operator()(Key key) const
    {
        const size_t hash = Key(key);
        return hash;
    }
};
/// Collects the string form (via toString) of every element's value in `table`
/// into an ordered set, so that contents can be compared independently of the
/// table's iteration order.
template <typename HashTable>
std::set<std::string> convertToSet(const HashTable & table)
{
    std::set<std::string> result;

    /// Bind by const reference: the element is only read, so there is no need to copy it.
    for (const auto & v : table)
        result.emplace(toString(v.getValue()));

    return result;
}
/// Inserting two distinct keys must leave exactly two elements in the set.
TEST(HashTable, Insert)
{
    using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;

    Cont cont;
    cont.insert(1);
    cont.insert(2);

    ASSERT_EQ(cont.size(), 2);
}
/// emplace() must report `inserted == true` for a new key and `false` for a key
/// that is already present, and in both cases point `it` at the cell with that key.
TEST(HashTable, Emplace)
{
    using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;

    Cont cont;
    Cont::LookupResult it = nullptr;
    bool inserted = false;

    /// First key: new.
    cont.emplace(1, it, inserted);
    ASSERT_EQ(it->getKey(), 1);
    ASSERT_TRUE(inserted);

    /// Second key: new.
    cont.emplace(2, it, inserted);
    ASSERT_EQ(it->getKey(), 2);
    ASSERT_TRUE(inserted);

    /// First key again: already present.
    cont.emplace(1, it, inserted);
    ASSERT_EQ(it->getKey(), 1);
    ASSERT_FALSE(inserted);
}
/// find() must return a non-null result for inserted keys and nullptr for a missing key.
TEST(HashTable, Lookup)
{
    using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;

    Cont cont;
    cont.insert(1);
    cont.insert(2);

    Cont::LookupResult it = cont.find(1);
    ASSERT_NE(it, nullptr);

    it = cont.find(2);
    ASSERT_NE(it, nullptr);

    /// 3 was never inserted.
    it = cont.find(3);
    ASSERT_EQ(it, nullptr);
}
/// Iterating over the set must visit every inserted key exactly once.
/// The comparison goes through std::set, so iteration order does not matter.
TEST(HashTable, Iteration)
{
    using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;

    Cont cont;
    cont.insert(1);
    cont.insert(2);
    cont.insert(3);

    std::set<std::string> expected = {"1", "2", "3"};
    std::set<std::string> actual = convertToSet(cont);

    ASSERT_EQ(actual, expected);
}
/// Checks erase(): an erased key is no longer found, and every other key stays
/// findable even when it shares a collision chain with the erased key.
TEST(HashTable, Erase)
{
    /// DummyHash hashes a key to itself. The cell layouts sketched in the comments below
    /// assume that HashTableGrowerWithPrecalculation<4> starts with 16 cells (TODO confirm),
    /// so that, for example, keys 1 and 17 share natural position 1.
    using SmallCont = HashSet<int, DummyHash<int>, HashTableGrowerWithPrecalculation<4>>;

    /// True if `key` is found and the located cell really holds `key` (single lookup).
    const auto contains = [](auto & cont, int key)
    {
        auto it = cont.find(key);
        return it != nullptr && it->getKey() == key;
    };

    {
        /// Check zero element deletion
        SmallCont cont;

        cont.insert(0);
        ASSERT_TRUE(contains(cont, 0));

        cont.erase(0);
        ASSERT_TRUE(cont.find(0) == nullptr);
    }

    {
        /// [.(1)..............] erase of (1).
        SmallCont cont;

        cont.insert(1);
        ASSERT_TRUE(contains(cont, 1));

        cont.erase(1);
        ASSERT_TRUE(cont.find(1) == nullptr);
    }

    {
        /// [.(1)(2)(3)............] erase of (1) does not break search for (2) (3).
        SmallCont cont;

        cont.insert(1);
        cont.insert(2);
        cont.insert(3);
        cont.erase(1);

        ASSERT_TRUE(cont.find(1) == nullptr);
        ASSERT_TRUE(contains(cont, 2));
        ASSERT_TRUE(contains(cont, 3));

        cont.erase(2);
        cont.erase(3);
        ASSERT_TRUE(cont.find(2) == nullptr);
        ASSERT_TRUE(cont.find(3) == nullptr);
        ASSERT_EQ(cont.size(), 0);
    }

    {
        /// [.(1)(17).............] erase of (1) must not break search for (17):
        /// both have natural position 1, so (17) sits right after (1) in the chain.
        SmallCont cont;

        cont.insert(1);
        cont.insert(17);
        cont.erase(1);

        ASSERT_TRUE(cont.find(1) == nullptr);
        ASSERT_TRUE(contains(cont, 17));
    }

    {
        /// [.(1)(2)(3)(17)...........] erase of (2) must not break search for (17):
        /// the natural position of (17) is 1, and (2) lies between that position and (17).
        SmallCont cont;

        cont.insert(1);
        cont.insert(2);
        cont.insert(3);
        cont.insert(17);
        cont.erase(2);

        ASSERT_TRUE(cont.find(2) == nullptr);
        ASSERT_TRUE(contains(cont, 1));
        ASSERT_TRUE(contains(cont, 3));
        ASSERT_TRUE(contains(cont, 17));
    }

    {
        /// [(16)(30)............(14)(15)] erase of (16) must not break search for (30):
        /// the natural position of (30) is 14, and its chain wraps around the end of the buffer.
        SmallCont cont;

        cont.insert(14);
        cont.insert(15);
        cont.insert(16);
        cont.insert(30);
        cont.erase(16);

        ASSERT_TRUE(cont.find(16) == nullptr);
        ASSERT_TRUE(contains(cont, 14));
        ASSERT_TRUE(contains(cont, 15));
        ASSERT_TRUE(contains(cont, 30));
    }

    {
        /// [(16)(30)............(14)(15)] erase of (15) must not break search for (30):
        /// the natural position of (30) is 14, and (15) lies between that position and (30).
        SmallCont cont;

        cont.insert(14);
        cont.insert(15);
        cont.insert(16);
        cont.insert(30);
        cont.erase(15);

        ASSERT_TRUE(cont.find(15) == nullptr);
        ASSERT_TRUE(contains(cont, 14));
        ASSERT_TRUE(contains(cont, 16));
        ASSERT_TRUE(contains(cont, 30));
    }

    {
        /// Bulk case with the default hash: interleave inserts and erases of
        /// thousands of keys; the set must be empty once every key is erased.
        using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;
        Cont cont;

        for (int i = 0; i < 5000; ++i)
            cont.insert(i);

        for (int i = 0; i < 2500; ++i)
            cont.erase(i);

        for (int i = 5000; i < 10000; ++i)
            cont.insert(i);

        for (int i = 5000; i < 10000; ++i)
            cont.erase(i);

        for (int i = 2500; i < 5000; ++i)
            cont.erase(i);

        ASSERT_EQ(cont.size(), 0);
    }
}
/// Round-trips sets through the text (writeText/readText) and binary (write/read)
/// formats and checks that the deserialized set has the same contents.
TEST(HashTable, SerializationDeserialization)
{
    {
        /// Text format, non-empty set.
        /// Use dummy hash to make it reproducible if default hash implementation will be changed
        using Cont = HashSet<int, DummyHash<int>, HashTableGrowerWithPrecalculation<1>>;

        Cont cont;
        cont.insert(1);
        cont.insert(2);
        cont.insert(3);

        WriteBufferFromOwnString wb;
        cont.writeText(wb);

        /// The expected text is the element count followed by the elements, comma-separated.
        std::string expected = "3,1,2,3";
        ASSERT_EQ(wb.str(), expected);

        ReadBufferFromString rb(expected);

        Cont deserialized;
        deserialized.readText(rb);
        ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
    }

    {
        /// Binary format, non-empty set. Only the round trip is checked, not the exact bytes.
        using Cont = HashSet<int, DefaultHash<int>, HashTableGrowerWithPrecalculation<1>>;

        Cont cont;
        cont.insert(1);
        cont.insert(2);
        cont.insert(3);

        WriteBufferFromOwnString wb;
        cont.write(wb);

        ReadBufferFromString rb(wb.str());

        Cont deserialized;
        deserialized.read(rb);
        ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
    }

    {
        /// Text format, empty set: expected to serialize as "0".
        using Cont = HashSet<int, DummyHash<int>, HashTableGrowerWithPrecalculation<1>>;

        Cont cont;

        WriteBufferFromOwnString wb;
        cont.writeText(wb);

        std::string expected = "0";
        ASSERT_EQ(wb.str(), expected);

        ReadBufferFromString rb(expected);

        Cont deserialized;
        deserialized.readText(rb);
        ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
    }

    {
        /// Binary format, empty set of UInt128: expected to serialize as a single zero byte.
        using Cont = HashSet<UInt128, UInt128TrivialHash>;

        Cont cont;

        WriteBufferFromOwnString wb;
        cont.write(wb);

        std::string expected;
        expected += static_cast<char>(0);
        ASSERT_EQ(wb.str(), expected);

        ReadBufferFromString rb(expected);

        Cont deserialized;
        deserialized.read(rb);
        ASSERT_EQ(convertToSet(cont), convertToSet(deserialized));
    }
}
/// Hash functor that returns its argument converted to size_t.
template <typename Value>
struct IdentityHash
{
    size_t operator()(Value value) const
    {
        return static_cast<size_t>(value);
    }
};
/// Grower for tests: starts with a single cell and enlarges the buffer by
/// exactly one cell on every increaseSize() call.
struct OneElementResizeGrower
{
    /// If collision resolution chains are contiguous, we can implement erase operation by moving the elements.
    static constexpr bool performs_linear_probing_with_single_step = true;

    /// Buffer size of a freshly constructed grower.
    static constexpr size_t initial_count = 1;

    /// Current number of cells.
    size_t buf_size = initial_count;

    size_t bufSize() const
    {
        return buf_size;
    }

    /// Cell index for hash value `hash`: the hash modulo the buffer size.
    size_t place(size_t hash) const
    {
        return hash % buf_size;
    }

    /// Cell index following `pos`, wrapping around to 0 past the last cell.
    size_t next(size_t pos) const
    {
        const size_t following = pos + 1;
        return following % buf_size;
    }

    /// True once the element count reaches the buffer size.
    bool overflow(size_t elems) const
    {
        return !(elems < buf_size);
    }

    void increaseSize()
    {
        buf_size += 1;
    }

    /// Intentionally a no-op: the argument is ignored.
    void set(size_t /*num_elems*/)
    {
    }

    void setBufSize(size_t new_buf_size)
    {
        buf_size = new_buf_size;
    }
};
/// Inserts into a set whose grower enlarges the buffer one cell at a time and
/// checks that all inserted keys are still present afterwards.
TEST(HashTable, Resize)
{
    /// Test edge case if after resize all cells are resized in end of buf and will take half of
    /// hash table place.
    using Cont = HashSet<int, IdentityHash<int>, OneElementResizeGrower>;

    Cont cont;
    cont.insert(3);
    cont.insert(1);

    std::set<std::string> expected = {"1", "3"};
    std::set<std::string> actual = convertToSet(cont);

    ASSERT_EQ(actual, expected);
}
/// Keys used to populate the set under test.
using HashSetContent = std::vector<UInt64>;

/// Value-parameterized fixture: each test instance gets one HashSetContent through GetParam().
class TwoLevelHashSetFixture : public ::testing::TestWithParam<HashSetContent> {};
/// Fills a TwoLevelHashSet with the parameter's keys, serializes it with
/// writeAsSingleLevel(), reads the bytes back into a plain HashSet, and checks
/// that the size matches and every key can be found.
TEST_P(TwoLevelHashSetFixture, WriteAsSingleLevel)
{
    using Key = UInt64;
    using Hash = HashCRC32<Key>;

    const HashSetContent & keys = GetParam();

    TwoLevelHashSet<Key, Hash> source;
    for (const Key & key : keys)
        source.insert(key);

    WriteBufferFromOwnString serialized;
    source.writeAsSingleLevel(serialized);

    ReadBufferFromString input(serialized.str());
    HashSet<Key, Hash> restored;
    restored.read(input);

    EXPECT_EQ(restored.size(), keys.size());
    for (const Key & key : keys)
        EXPECT_NE(restored.find(key), nullptr);
}
/// Instantiates the TwoLevelHashSetFixture tests for an empty key list and for
/// key lists 0..N-1 with N growing from 1 to 1000000.
INSTANTIATE_TEST_SUITE_P(
    TwoLevelHashSetTests,
    TwoLevelHashSetFixture,
    ::testing::Values(
        HashSetContent{},
        getVectorWithNumbersUpToN(1),
        getVectorWithNumbersUpToN(100),
        getVectorWithNumbersUpToN(1000),
        getVectorWithNumbersUpToN(10000),
        getVectorWithNumbersUpToN(100000),
        getVectorWithNumbersUpToN(1000000)));