mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 19:02:04 +00:00
Add Entropies typedef
This commit is contained in:
parent
d2dc5e9fc8
commit
6a454ed6c3
@ -30,6 +30,7 @@ namespace
|
|||||||
|
|
||||||
using Key = String;
|
using Key = String;
|
||||||
using PartialKeyPositions = std::vector<size_t>;
|
using PartialKeyPositions = std::vector<size_t>;
|
||||||
|
using Entropies = std::vector<size_t>;
|
||||||
|
|
||||||
Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result)
|
Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result)
|
||||||
{
|
{
|
||||||
@ -99,23 +100,18 @@ std::pair<size_t, size_t> nextByte(const std::vector<Key> & keys, size_t max_len
|
|||||||
// return nextByte(keys, max_len, chosen_bytes);
|
// return nextByte(keys, max_len, chosen_bytes);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
std::pair<PartialKeyPositions, std::vector<size_t>> chooseBytes(const std::vector<Key> & train_data)
|
std::pair<PartialKeyPositions, Entropies> chooseBytes(const std::vector<Key> & train_data)
|
||||||
{
|
{
|
||||||
if (train_data.size() <= 1)
|
if (train_data.size() <= 1)
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
// position contains numbers of chosen bytes
|
|
||||||
PartialKeyPositions partial_key_positions;
|
PartialKeyPositions partial_key_positions;
|
||||||
|
Entropies entropies;
|
||||||
|
|
||||||
// entropies contains entropies of keys after each new chosen byte
|
size_t max_len = 0; /// length of the longest key in training data
|
||||||
std::vector<size_t> entropies;
|
|
||||||
|
|
||||||
// max_len is a maximal length of any key in train_data
|
|
||||||
size_t max_len = 0;
|
|
||||||
for (const auto & key : train_data)
|
for (const auto & key : train_data)
|
||||||
max_len = std::max(max_len, key.size());
|
max_len = std::max(max_len, key.size());
|
||||||
|
|
||||||
// while not all partial keys unique, choose new byte and recalculate the entropy
|
|
||||||
while (!allPartialKeysAreUnique(train_data, partial_key_positions))
|
while (!allPartialKeysAreUnique(train_data, partial_key_positions))
|
||||||
{
|
{
|
||||||
auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions);
|
auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions);
|
||||||
@ -198,6 +194,7 @@ public:
|
|||||||
{
|
{
|
||||||
const size_t num_rows = col_data_string->size();
|
const size_t num_rows = col_data_string->size();
|
||||||
|
|
||||||
|
/// TODO this does some needless copying ... chooseBytes() should ideally understand the native ColumnString representation
|
||||||
std::vector<Key> training_data;
|
std::vector<Key> training_data;
|
||||||
for (size_t i = 0; i < num_rows; ++i)
|
for (size_t i = 0; i < num_rows; ++i)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user