mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
97f2a2213e
* Move some code outside dbms/src folder * Fix paths
554 lines
16 KiB
C++
554 lines
16 KiB
C++
#pragma once
|
|
|
|
#include <math.h>
|
|
|
|
#include <common/types.h>
|
|
|
|
#include <IO/WriteBuffer.h>
|
|
#include <IO/WriteHelpers.h>
|
|
#include <IO/ReadBuffer.h>
|
|
#include <IO/ReadHelpers.h>
|
|
#include <IO/VarInt.h>
|
|
|
|
#include <Common/HashTable/HashTableAllocator.h>
|
|
#include <Common/HashTable/Hash.h>
|
|
|
|
|
|
/** An approximate calculation of some quantity X is usually constructed according to the following scheme:
  * - some data structure is used to calculate the value of X;
  * - not all values are added to the data structure, but only selected ones (according to some selectivity criterion);
  * - after all elements have been processed, the data structure is in some state S;
  * - the approximate value of X is then chosen according to the maximum likelihood principle:
  *   it is the real value of X for which the probability of finding the data structure in the observed state S is maximal.
  */
|
|
|
|
/** In particular, the approach described below is known as the BJKST algorithm.
  */
|
|
|
|
/** Very simple hash set for approximating the number of unique values.
  * Works like this:
  * - you can insert UInt64;
  * - before insertion, the hash function UInt64 -> UInt32 is calculated first;
  * - the original value is not saved (it is lost);
  * - all further operations are performed on these hashes;
  * - the hash table is constructed according to the scheme:
  * -  open addressing (one buffer, the position in the buffer is calculated by taking the remainder of division by its size);
  * -  linear probing (if the cell already has a value, then the cell following it is taken, etc.);
  * -  a missing value is encoded as zero; to remember the presence of zero in the set, a separate variable of type bool is used;
  * -  the buffer grows by 2 times when it is filled more than 50%;
  * - if the set has more than UNIQUES_HASH_MAX_SIZE elements, then all elements not divisible by 2 are removed from the set,
  *   and from then on elements that are not divisible by 2 are not inserted into the set;
  * - if the situation repeats, then only elements divisible by 4 are kept, etc.;
  * - the size() method returns an approximate number of elements that have been inserted into the set;
  * - there are methods for quick reading and writing in binary and text form.
  */
|
|
|
|
/// The maximum degree of buffer size before the values are discarded
|
|
#define UNIQUES_HASH_MAX_SIZE_DEGREE 17
|
|
|
|
/// The maximum number of elements before the values are discarded
|
|
#define UNIQUES_HASH_MAX_SIZE (1ULL << (UNIQUES_HASH_MAX_SIZE_DEGREE - 1))
|
|
|
|
/** The number of least significant bits used for thinning. The remaining high-order bits are used to determine the position in the hash table.
  * (the high-order bits are taken because the low-order bits become constant after some of the values have been dropped)
  */
|
|
#define UNIQUES_HASH_BITS_FOR_SKIP (32 - UNIQUES_HASH_MAX_SIZE_DEGREE)
|
|
|
|
/// Initial buffer size degree
|
|
#define UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE 4
|
|
|
|
|
|
/** This hash function is not the most optimal, but UniquesHashSet states counted with it,
|
|
* stored in many places on disks (in the Yandex.Metrika), so it continues to be used.
|
|
*/
|
|
struct UniquesHashSetDefaultHash
|
|
{
|
|
size_t operator() (UInt64 x) const
|
|
{
|
|
return intHash32<0>(x);
|
|
}
|
|
};
|
|
|
|
|
|
template <typename Hash = UniquesHashSetDefaultHash>
|
|
class UniquesHashSet : private HashTableAllocatorWithStackMemory<(1ULL << UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE) * sizeof(UInt32)>
|
|
{
|
|
private:
|
|
using Value = UInt64;
|
|
using HashValue = UInt32;
|
|
using Allocator = HashTableAllocatorWithStackMemory<(1ULL << UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE) * sizeof(UInt32)>;
|
|
|
|
UInt32 m_size; /// Number of elements
|
|
UInt8 size_degree; /// The size of the table as a power of 2
|
|
UInt8 skip_degree; /// Skip elements not divisible by 2 ^ skip_degree
|
|
bool has_zero; /// The hash table contains an element with a hash value of 0.
|
|
|
|
HashValue * buf;
|
|
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
/// For profiling.
|
|
mutable size_t collisions;
|
|
#endif
|
|
|
|
/** Obtain a buffer of 2 ^ new_size_degree cells from the base allocator and remember the new degree.
  * A buffer that is already held is not released here; callers that own one call free() first.
  * NOTE(review): the class treats a zero cell as empty right after alloc(), so Allocator::alloc
  * is presumably expected to return zero-filled memory - confirm against HashTableAllocator.
  */
void alloc(UInt8 new_size_degree)
{
    const size_t cells = 1ULL << new_size_degree;
    buf = reinterpret_cast<HashValue *>(Allocator::alloc(cells * sizeof(HashValue)));
    size_degree = new_size_degree;
}
|
|
|
|
void free()
|
|
{
|
|
if (buf)
|
|
{
|
|
Allocator::free(buf, buf_size() * sizeof(buf[0]));
|
|
buf = nullptr;
|
|
}
|
|
}
|
|
|
|
/// Number of cells in the buffer: 2 ^ size_degree.
inline size_t buf_size() const
{
    return 1ULL << size_degree;
}
|
|
inline size_t max_fill() const { return 1ULL << (size_degree - 1); }
|
|
/// Bit mask that reduces an index to the range [0, buf_size()); relies on the size being a power of two.
inline size_t mask() const
{
    const size_t cells = buf_size();
    return cells - 1;
}
|
|
/// Home cell of a hash: the bits above the low UNIQUES_HASH_BITS_FOR_SKIP ones, reduced to the buffer size.
inline size_t place(HashValue x) const
{
    const size_t high_bits = x >> UNIQUES_HASH_BITS_FOR_SKIP;
    return high_bits & mask();
}
|
|
|
|
/// The value is divided by 2 ^ skip_degree
|
|
inline bool good(HashValue hash) const
|
|
{
|
|
return hash == ((hash >> skip_degree) << skip_degree);
|
|
}
|
|
|
|
/// Hash a key with a temporary Hash functor; the result is narrowed to HashValue.
HashValue hash(Value key) const
{
    return static_cast<HashValue>(Hash()(key));
}
|
|
|
|
/// Delete all values whose hashes do not divide by 2 ^ skip_degree
|
|
void rehash()
|
|
{
|
|
for (size_t i = 0; i < buf_size(); ++i)
|
|
{
|
|
if (buf[i])
|
|
{
|
|
if (!good(buf[i]))
|
|
{
|
|
buf[i] = 0;
|
|
--m_size;
|
|
}
|
|
/** After removing the elements, there may have been room for items,
|
|
* which were placed further than necessary, due to a collision.
|
|
* You need to move them.
|
|
*/
|
|
else if (i != place(buf[i]))
|
|
{
|
|
HashValue x = buf[i];
|
|
buf[i] = 0;
|
|
reinsertImpl(x);
|
|
}
|
|
}
|
|
}
|
|
|
|
/** We must process first collision resolution chain once again.
|
|
* Look at the comment in "resize" function.
|
|
*/
|
|
for (size_t i = 0; i < buf_size() && buf[i]; ++i)
|
|
{
|
|
if (i != place(buf[i]))
|
|
{
|
|
HashValue x = buf[i];
|
|
buf[i] = 0;
|
|
reinsertImpl(x);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Increase the size of the buffer 2 times or up to new_size_degree, if it is non-zero.
|
|
void resize(size_t new_size_degree = 0)
|
|
{
|
|
size_t old_size = buf_size();
|
|
|
|
if (!new_size_degree)
|
|
new_size_degree = size_degree + 1;
|
|
|
|
/// Expand the space.
|
|
buf = reinterpret_cast<HashValue *>(Allocator::realloc(buf, old_size * sizeof(buf[0]), (1ULL << new_size_degree) * sizeof(buf[0])));
|
|
size_degree = new_size_degree;
|
|
|
|
/** Now some items may need to be moved to a new location.
|
|
* The element can stay in place, or move to a new location "on the right",
|
|
* or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location.
|
|
* There is also a special case
|
|
* if the element was to be at the end of the old buffer, [ x]
|
|
* but is at the beginning because of the collision resolution chain, [o x]
|
|
* then after resizing, it will first be out of place again, [ xo ]
|
|
* and in order to transfer it to where you need it,
|
|
* will have to be after transferring all elements from the old half [ o x ]
|
|
* process another tail from the collision resolution chain immediately after it [ o x ]
|
|
* This is why || buf[i] below.
|
|
*/
|
|
for (size_t i = 0; i < old_size || buf[i]; ++i)
|
|
{
|
|
HashValue x = buf[i];
|
|
if (!x)
|
|
continue;
|
|
|
|
size_t place_value = place(x);
|
|
|
|
/// The element is in its place.
|
|
if (place_value == i)
|
|
continue;
|
|
|
|
while (buf[place_value] && buf[place_value] != x)
|
|
{
|
|
++place_value;
|
|
place_value &= mask();
|
|
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
++collisions;
|
|
#endif
|
|
}
|
|
|
|
/// The element remained in its place.
|
|
if (buf[place_value] == x)
|
|
continue;
|
|
|
|
buf[place_value] = x;
|
|
buf[i] = 0;
|
|
}
|
|
}
|
|
|
|
/// Insert a value.
|
|
void insertImpl(HashValue x)
|
|
{
|
|
if (x == 0)
|
|
{
|
|
m_size += !has_zero;
|
|
has_zero = true;
|
|
return;
|
|
}
|
|
|
|
size_t place_value = place(x);
|
|
while (buf[place_value] && buf[place_value] != x)
|
|
{
|
|
++place_value;
|
|
place_value &= mask();
|
|
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
++collisions;
|
|
#endif
|
|
}
|
|
|
|
if (buf[place_value] == x)
|
|
return;
|
|
|
|
buf[place_value] = x;
|
|
++m_size;
|
|
}
|
|
|
|
/** Insert a value into the new buffer that was in the old buffer.
|
|
* Used when increasing the size of the buffer, as well as when reading from a file.
|
|
*/
|
|
void reinsertImpl(HashValue x)
|
|
{
|
|
size_t place_value = place(x);
|
|
while (buf[place_value])
|
|
{
|
|
++place_value;
|
|
place_value &= mask();
|
|
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
++collisions;
|
|
#endif
|
|
}
|
|
|
|
buf[place_value] = x;
|
|
}
|
|
|
|
/** If the hash table is full enough, then do resize.
|
|
* If there are too many items, then throw half the pieces until they are small enough.
|
|
*/
|
|
void shrinkIfNeed()
|
|
{
|
|
if (unlikely(m_size > max_fill()))
|
|
{
|
|
if (m_size > UNIQUES_HASH_MAX_SIZE)
|
|
{
|
|
while (m_size > UNIQUES_HASH_MAX_SIZE)
|
|
{
|
|
++skip_degree;
|
|
rehash();
|
|
}
|
|
}
|
|
else
|
|
resize();
|
|
}
|
|
}
|
|
|
|
|
|
public:
|
|
using value_type = Value;
|
|
|
|
UniquesHashSet() :
|
|
m_size(0),
|
|
skip_degree(0),
|
|
has_zero(false)
|
|
{
|
|
alloc(UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE);
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
collisions = 0;
|
|
#endif
|
|
}
|
|
|
|
UniquesHashSet(const UniquesHashSet & rhs)
|
|
: m_size(rhs.m_size), skip_degree(rhs.skip_degree), has_zero(rhs.has_zero)
|
|
{
|
|
alloc(rhs.size_degree);
|
|
memcpy(buf, rhs.buf, buf_size() * sizeof(buf[0]));
|
|
}
|
|
|
|
/** Copy assignment: makes this set an exact copy of rhs.
  * The buffer is reallocated only when the size degrees differ; the cells are then copied verbatim.
  * Returns *this.
  */
UniquesHashSet & operator= (const UniquesHashSet & rhs)
{
    /// Self-assignment would hand the same buffer to memcpy as both source and destination,
    /// which memcpy does not allow. Nothing has to be copied in that case anyway.
    if (this == &rhs)
        return *this;

    if (size_degree != rhs.size_degree)
    {
        free();
        alloc(rhs.size_degree);
    }

    m_size = rhs.m_size;
    skip_degree = rhs.skip_degree;
    has_zero = rhs.has_zero;

    memcpy(buf, rhs.buf, buf_size() * sizeof(buf[0]));

    return *this;
}
|
|
|
|
/// Release the buffer.
~UniquesHashSet()
{
    this->free();
}
|
|
|
|
/** Insert a value: it is hashed, and the hash is then handled exactly as in insertHash() -
  * ignored if it does not pass the current thinning criterion, otherwise added to the set,
  * after which the table is grown or thinned if it became too full.
  */
void insert(Value x)
{
    insertHash(hash(x));
}
|
|
|
|
size_t size() const
|
|
{
|
|
if (0 == skip_degree)
|
|
return m_size;
|
|
|
|
size_t res = m_size * (1ULL << skip_degree);
|
|
|
|
/** Pseudo-random remainder - in order to be not visible,
|
|
* that the number is divided by the power of two.
|
|
*/
|
|
res += (intHashCRC32(m_size) & ((1ULL << skip_degree) - 1));
|
|
|
|
/** Correction of a systematic error due to collisions during hashing in UInt32.
|
|
* `fixed_res(res)` formula
|
|
* - with how many different elements of fixed_res,
|
|
* when randomly scattered across 2^32 buckets,
|
|
* filled buckets with average of res is obtained.
|
|
*/
|
|
size_t p32 = 1ULL << 32;
|
|
size_t fixed_res = round(p32 * (log(p32) - log(p32 - res)));
|
|
return fixed_res;
|
|
}
|
|
|
|
void merge(const UniquesHashSet & rhs)
|
|
{
|
|
if (rhs.skip_degree > skip_degree)
|
|
{
|
|
skip_degree = rhs.skip_degree;
|
|
rehash();
|
|
}
|
|
|
|
if (!has_zero && rhs.has_zero)
|
|
{
|
|
has_zero = true;
|
|
++m_size;
|
|
shrinkIfNeed();
|
|
}
|
|
|
|
for (size_t i = 0; i < rhs.buf_size(); ++i)
|
|
{
|
|
if (rhs.buf[i] && good(rhs.buf[i]))
|
|
{
|
|
insertImpl(rhs.buf[i]);
|
|
shrinkIfNeed();
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Write the state in binary form: skip_degree, then the number of elements as a VarUInt,
  * then the hashes themselves (zero first, if the set contains it).
  * Throws Poco::Exception if the set holds more than UNIQUES_HASH_MAX_SIZE elements.
  */
void write(DB::WriteBuffer & wb) const
{
    if (m_size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot write UniquesHashSet: too large size_degree.");

    DB::writeIntBinary(skip_degree, wb);
    DB::writeVarUInt(m_size, wb);

    if (has_zero)
    {
        HashValue zero = 0;
        DB::writeIntBinary(zero, wb);
    }

    for (size_t pos = 0; pos < buf_size(); ++pos)
    {
        HashValue value = buf[pos];

        /// Empty cell.
        if (value == 0)
            continue;

        DB::writeIntBinary(value, wb);
    }
}
|
|
|
|
/** Replace the contents of the set with a state in the binary format produced by write().
  * The header is read into local variables and validated before the object is modified,
  * so a rejected header leaves the set untouched.
  * Throws Poco::Exception if the declared number of elements exceeds UNIQUES_HASH_MAX_SIZE
  * or if skip_degree is not smaller than the bit width of a hash.
  */
void read(DB::ReadBuffer & rb)
{
    UInt8 new_skip_degree = 0;
    size_t new_size = 0;

    DB::readIntBinary(new_skip_degree, rb);
    DB::readVarUInt(new_size, rb);

    if (new_size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot read UniquesHashSet: too large size_degree.");

    /// good() shifts a hash by skip_degree; a shift by the full bit width or more is undefined.
    if (new_skip_degree >= sizeof(HashValue) * 8)
        throw Poco::Exception("Cannot read UniquesHashSet: too large skip_degree.");

    free();

    /// The buffer is sized so that the declared number of elements fills at most half of it.
    UInt8 new_size_degree = new_size <= 1
        ? UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE
        : std::max(UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE, static_cast<int>(log2(new_size - 1)) + 2);

    alloc(new_size_degree);

    skip_degree = new_skip_degree;
    has_zero = false;

    /// m_size is counted while reading: if the stream ends prematurely,
    /// it still matches what has actually been stored.
    m_size = 0;

    for (size_t i = 0; i < new_size; ++i)
    {
        HashValue x = 0;
        DB::readIntBinary(x, rb);
        if (x == 0)
            has_zero = true;
        else
            reinsertImpl(x);
        ++m_size;
    }
}
|
|
|
|
/** Read a state in the binary format produced by write() and merge it into this set
  * without building an intermediate object.
  * The whole header is read and validated before the set is modified, so a rejected header
  * no longer thins out the existing contents.
  * Throws Poco::Exception if the declared number of elements exceeds UNIQUES_HASH_MAX_SIZE
  * or if skip_degree is not smaller than the bit width of a hash.
  */
void readAndMerge(DB::ReadBuffer & rb)
{
    UInt8 rhs_skip_degree = 0;
    DB::readIntBinary(rhs_skip_degree, rb);

    size_t rhs_size = 0;
    DB::readVarUInt(rhs_size, rb);

    if (rhs_size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot read UniquesHashSet: too large size_degree.");

    /// good() shifts a hash by skip_degree; a shift by the full bit width or more is undefined.
    if (rhs_skip_degree >= sizeof(HashValue) * 8)
        throw Poco::Exception("Cannot read UniquesHashSet: too large skip_degree.");

    /// If the incoming state has been thinned more strongly, thin this set to the same degree.
    if (rhs_skip_degree > skip_degree)
    {
        skip_degree = rhs_skip_degree;
        rehash();
    }

    /// Grow the buffer in advance if it has fewer cells than the number of incoming elements.
    if ((1ULL << size_degree) < rhs_size)
    {
        UInt8 new_size_degree = std::max(UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE, static_cast<int>(log2(rhs_size - 1)) + 2);
        resize(new_size_degree);
    }

    for (size_t i = 0; i < rhs_size; ++i)
    {
        HashValue x = 0;
        DB::readIntBinary(x, rb);
        insertHash(x);
    }
}
|
|
|
|
/** Skip a state in the binary format produced by write() without constructing a set.
  * Throws Poco::Exception if the declared number of elements exceeds UNIQUES_HASH_MAX_SIZE.
  */
static void skip(DB::ReadBuffer & rb)
{
    /// NOTE(review): presumably skips the single byte that holds skip_degree - confirm ReadBuffer::ignore().
    rb.ignore();

    size_t size = 0;
    DB::readVarUInt(size, rb);

    if (size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot read UniquesHashSet: too large size_degree.");

    const size_t payload_bytes = sizeof(HashValue) * size;
    rb.ignore(payload_bytes);
}
|
|
|
|
/** Write the state in text form as comma-separated numbers:
  * skip_degree, the number of elements, then the hashes (zero first, if the set contains it).
  * Throws Poco::Exception if the set holds more than UNIQUES_HASH_MAX_SIZE elements.
  */
void writeText(DB::WriteBuffer & wb) const
{
    if (m_size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot write UniquesHashSet: too large size_degree.");

    DB::writeIntText(skip_degree, wb);
    wb.write(",", 1);
    DB::writeIntText(m_size, wb);

    if (has_zero)
        wb.write(",0", 2);

    for (size_t pos = 0; pos < buf_size(); ++pos)
    {
        /// Empty cell.
        if (!buf[pos])
            continue;

        wb.write(",", 1);
        DB::writeIntText(buf[pos], wb);
    }
}
|
|
|
|
/** Replace the contents of the set with a state in the text format produced by writeText().
  * The header is read into local variables and validated before the object is modified,
  * so a rejected header leaves the set untouched.
  * Throws Poco::Exception if the declared number of elements exceeds UNIQUES_HASH_MAX_SIZE
  * or if skip_degree is not smaller than the bit width of a hash.
  */
void readText(DB::ReadBuffer & rb)
{
    UInt8 new_skip_degree = 0;
    size_t new_size = 0;

    DB::readIntText(new_skip_degree, rb);
    DB::assertChar(',', rb);
    DB::readIntText(new_size, rb);

    if (new_size > UNIQUES_HASH_MAX_SIZE)
        throw Poco::Exception("Cannot read UniquesHashSet: too large size_degree.");

    /// good() shifts a hash by skip_degree; a shift by the full bit width or more is undefined.
    if (new_skip_degree >= sizeof(HashValue) * 8)
        throw Poco::Exception("Cannot read UniquesHashSet: too large skip_degree.");

    free();

    /// The buffer is sized so that the declared number of elements fills at most half of it.
    UInt8 new_size_degree = new_size <= 1
        ? UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE
        : std::max(UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE, static_cast<int>(log2(new_size - 1)) + 2);

    alloc(new_size_degree);

    skip_degree = new_skip_degree;
    has_zero = false;

    /// m_size is counted while reading: if the stream ends prematurely,
    /// it still matches what has actually been stored.
    m_size = 0;

    for (size_t i = 0; i < new_size; ++i)
    {
        HashValue x = 0;
        DB::assertChar(',', rb);
        DB::readIntText(x, rb);
        if (x == 0)
            has_zero = true;
        else
            reinsertImpl(x);
        ++m_size;
    }
}
|
|
|
|
/** Insert an already calculated hash.
  * Hashes that do not pass the current thinning criterion are ignored;
  * after a real insertion the table is grown or thinned if it became too full.
  */
void insertHash(HashValue hash_value)
{
    if (good(hash_value))
    {
        insertImpl(hash_value);
        shrinkIfNeed();
    }
}
|
|
|
|
#ifdef UNIQUES_HASH_SET_COUNT_COLLISIONS
|
|
size_t getCollisions() const
|
|
{
|
|
return collisions;
|
|
}
|
|
#endif
|
|
};
|
|
|
|
|
|
#undef UNIQUES_HASH_MAX_SIZE_DEGREE
|
|
#undef UNIQUES_HASH_MAX_SIZE
|
|
#undef UNIQUES_HASH_BITS_FOR_SKIP
|
|
#undef UNIQUES_HASH_SET_INITIAL_SIZE_DEGREE
|