ClickHouse/utils/test-data-generator/MarkovModel.h
Amos Bird 26ab5dd7a7 A Proper lookup table that uses HashTable's API
This is the first step of allowing heterogeneous cells in hash tables.

Performance test results are shown below:

```

1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>;
2. NewLookupMap<UInt16, UInt8>

ResolutionWidth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................223550276.46
ResolutionWidth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24
Best: 2 - 24877272124

ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99
ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98
Best: 2 - 26180888998

ResolutionWidth 300000 1 ...................................................................................239307348.81
ResolutionWidth 300000 2 ...................................................................................257592761.30
Best: 2 - 25759276130

ResolutionWidth 1000000 1 .........................240144759.26
ResolutionWidth 1000000 2 .........................257093531.91
Best: 2 - 25709353191

ResolutionWidth 5000000 1 .....241573260.35
ResolutionWidth 5000000 2 .....259314162.79
Best: 2 - 25931416279

ResolutionDepth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84
ResolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41
Best: 2 - 24945950441

ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17
ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64
Best: 2 - 25376910564

ResolutionDepth 300000 1 ...................................................................................233079225.18
ResolutionDepth 300000 2 ...................................................................................256316273.78
Best: 2 - 25631627378

ResolutionDepth 1000000 1 .........................234184633.51
ResolutionDepth 1000000 2 .........................261100491.57
Best: 2 - 26110049157

ResolutionDepth 5000000 1 .....233118795.66
ResolutionDepth 5000000 2 .....252436160.41
Best: 2 - 25243616041

```
2019-03-01 16:47:13 +08:00

189 lines
4.1 KiB
C++

#pragma once
#include <Core/Types.h>
#include <Common/HashTable/HashMap.h>
#include <Common/Arena.h>
#include <ext/bit_cast.h>
#include <common/StringRef.h>
namespace DB
{
class MarkovModel
{
private:
using NGramHash = UInt32;
struct HistogramElement
{
UInt8 byte;
UInt32 count;
};
struct Histogram
{
UInt32 total = 0;
std::vector<HistogramElement> data;
void add(UInt8 byte)
{
++total;
for (auto & elem : data)
{
if (elem.byte == byte)
{
++elem.count;
return;
}
}
data.emplace_back(HistogramElement{.byte = byte, .count = 1});
}
UInt8 sample(UInt32 random) const
{
random %= total;
UInt32 sum = 0;
for (const auto & elem : data)
{
sum += elem.count;
if (sum > random)
return elem.byte;
}
__builtin_unreachable();
}
};
using Table = HashMap<NGramHash, Histogram, TrivialHash>;
Table table;
size_t n;
NGramHash hashContext(const char * pos, const char * data, size_t size) const
{
if (pos >= data + n)
return CRC32Hash()(StringRef(pos - n, n));
else
return CRC32Hash()(StringRef(data, pos - data));
}
public:
explicit MarkovModel(size_t n_) : n(n_) {}
MarkovModel() {}
void consume(const char * data, size_t size)
{
const char * pos = data;
const char * end = data + size;
while (pos < end)
{
table[hashContext(pos, data, size)].add(*pos);
++pos;
}
/// Mark end of string as zero byte.
table[hashContext(pos, data, size)].add(0);
}
template <typename Random>
size_t generate(char * data, size_t size, Random && random) const
{
char * pos = data;
char * end = data + size;
while (pos < end)
{
auto it = table.find(hashContext(pos, data, size));
if (table.end() == it)
return pos - data;
*pos = it->getSecond().sample(random());
/// Zero byte marks end of string.
if (0 == *pos)
return pos - data;
++pos;
}
return size;
}
/// Allows to add random noise to frequencies.
template <typename Transform>
void modifyCounts(Transform && transform)
{
for (auto & elem : table)
{
UInt32 new_total = 0;
for (auto & frequency : elem.getSecond().data)
{
frequency.count = transform(frequency.count);
new_total += frequency.count;
}
elem.getSecond().total = new_total;
}
}
void write(WriteBuffer & out) const
{
writeBinary(UInt8(n), out);
writeVarUInt(table.size(), out);
for (const auto & elem : table)
{
writeBinary(elem.getFirst(), out);
writeBinary(UInt8(elem.getSecond().data.size()), out);
for (const auto & frequency : elem.getSecond().data)
{
writeBinary(frequency.byte, out);
writeVarUInt(frequency.count, out);
}
}
}
void read(ReadBuffer & in)
{
table.clear();
UInt8 read_n = 0;
readBinary(read_n, in);
n = read_n;
size_t read_size = 0;
readVarUInt(read_size, in);
for (size_t i = 0; i < read_size; ++i)
{
NGramHash key = 0;
UInt8 historgam_size = 0;
readBinary(key, in);
readBinary(historgam_size, in);
Histogram & histogram = table[key];
histogram.data.resize(historgam_size);
for (size_t j = 0; j < historgam_size; ++j)
{
readBinary(histogram.data[j].byte, in);
readVarUInt(histogram.data[j].count, in);
histogram.total += histogram.data[j].count;
}
}
}
};
}