mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-18 05:32:52 +00:00
26ab5dd7a7
This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................223550276.46 ResolutionWidth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24 Best: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84 ResolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41 Best: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
189 lines
4.1 KiB
C++
189 lines
4.1 KiB
C++
#pragma once
|
|
|
|
#include <Core/Types.h>
|
|
#include <Common/HashTable/HashMap.h>
|
|
#include <Common/Arena.h>
|
|
#include <ext/bit_cast.h>
|
|
#include <common/StringRef.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
|
|
class MarkovModel
|
|
{
|
|
private:
|
|
using NGramHash = UInt32;
|
|
|
|
struct HistogramElement
|
|
{
|
|
UInt8 byte;
|
|
UInt32 count;
|
|
};
|
|
|
|
struct Histogram
|
|
{
|
|
UInt32 total = 0;
|
|
std::vector<HistogramElement> data;
|
|
|
|
void add(UInt8 byte)
|
|
{
|
|
++total;
|
|
|
|
for (auto & elem : data)
|
|
{
|
|
if (elem.byte == byte)
|
|
{
|
|
++elem.count;
|
|
return;
|
|
}
|
|
}
|
|
|
|
data.emplace_back(HistogramElement{.byte = byte, .count = 1});
|
|
}
|
|
|
|
UInt8 sample(UInt32 random) const
|
|
{
|
|
random %= total;
|
|
|
|
UInt32 sum = 0;
|
|
for (const auto & elem : data)
|
|
{
|
|
sum += elem.count;
|
|
if (sum > random)
|
|
return elem.byte;
|
|
}
|
|
|
|
__builtin_unreachable();
|
|
}
|
|
};
|
|
|
|
using Table = HashMap<NGramHash, Histogram, TrivialHash>;
|
|
Table table;
|
|
|
|
size_t n;
|
|
|
|
|
|
NGramHash hashContext(const char * pos, const char * data, size_t size) const
|
|
{
|
|
if (pos >= data + n)
|
|
return CRC32Hash()(StringRef(pos - n, n));
|
|
else
|
|
return CRC32Hash()(StringRef(data, pos - data));
|
|
}
|
|
|
|
public:
|
|
explicit MarkovModel(size_t n_) : n(n_) {}
|
|
MarkovModel() {}
|
|
|
|
void consume(const char * data, size_t size)
|
|
{
|
|
const char * pos = data;
|
|
const char * end = data + size;
|
|
|
|
while (pos < end)
|
|
{
|
|
table[hashContext(pos, data, size)].add(*pos);
|
|
++pos;
|
|
}
|
|
|
|
/// Mark end of string as zero byte.
|
|
table[hashContext(pos, data, size)].add(0);
|
|
}
|
|
|
|
|
|
template <typename Random>
|
|
size_t generate(char * data, size_t size, Random && random) const
|
|
{
|
|
char * pos = data;
|
|
char * end = data + size;
|
|
|
|
while (pos < end)
|
|
{
|
|
auto it = table.find(hashContext(pos, data, size));
|
|
if (table.end() == it)
|
|
return pos - data;
|
|
|
|
*pos = it->getSecond().sample(random());
|
|
|
|
/// Zero byte marks end of string.
|
|
if (0 == *pos)
|
|
return pos - data;
|
|
|
|
++pos;
|
|
}
|
|
|
|
return size;
|
|
}
|
|
|
|
|
|
/// Allows to add random noise to frequencies.
|
|
template <typename Transform>
|
|
void modifyCounts(Transform && transform)
|
|
{
|
|
for (auto & elem : table)
|
|
{
|
|
UInt32 new_total = 0;
|
|
for (auto & frequency : elem.getSecond().data)
|
|
{
|
|
frequency.count = transform(frequency.count);
|
|
new_total += frequency.count;
|
|
}
|
|
elem.getSecond().total = new_total;
|
|
}
|
|
}
|
|
|
|
|
|
void write(WriteBuffer & out) const
|
|
{
|
|
writeBinary(UInt8(n), out);
|
|
writeVarUInt(table.size(), out);
|
|
|
|
for (const auto & elem : table)
|
|
{
|
|
writeBinary(elem.getFirst(), out);
|
|
writeBinary(UInt8(elem.getSecond().data.size()), out);
|
|
|
|
for (const auto & frequency : elem.getSecond().data)
|
|
{
|
|
writeBinary(frequency.byte, out);
|
|
writeVarUInt(frequency.count, out);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void read(ReadBuffer & in)
|
|
{
|
|
table.clear();
|
|
|
|
UInt8 read_n = 0;
|
|
readBinary(read_n, in);
|
|
n = read_n;
|
|
|
|
size_t read_size = 0;
|
|
readVarUInt(read_size, in);
|
|
|
|
for (size_t i = 0; i < read_size; ++i)
|
|
{
|
|
NGramHash key = 0;
|
|
UInt8 historgam_size = 0;
|
|
readBinary(key, in);
|
|
readBinary(historgam_size, in);
|
|
|
|
Histogram & histogram = table[key];
|
|
histogram.data.resize(historgam_size);
|
|
|
|
for (size_t j = 0; j < historgam_size; ++j)
|
|
{
|
|
readBinary(histogram.data[j].byte, in);
|
|
readVarUInt(histogram.data[j].count, in);
|
|
histogram.total += histogram.data[j].count;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|