From 6a454ed6c3e959d1b7ea6cb9af2e74e638cc8b7d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:37:28 +0000 Subject: [PATCH] Add Entropies typedef --- src/Functions/EntropyLearnedHash.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index c42f0f65192..f7ea42dc55c 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -30,6 +30,7 @@ namespace using Key = String; using PartialKeyPositions = std::vector; +using Entropies = std::vector; Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) { @@ -99,23 +100,18 @@ std::pair nextByte(const std::vector & keys, size_t max_len // return nextByte(keys, max_len, chosen_bytes); // } -std::pair> chooseBytes(const std::vector & train_data) +std::pair chooseBytes(const std::vector & train_data) { if (train_data.size() <= 1) return {}; - // position contains numbers of chosen bytes PartialKeyPositions partial_key_positions; + Entropies entropies; - // entropies contains entropies of keys after each new chosen byte - std::vector entropies; - - // max_len is a maximal length of any key in train_data - size_t max_len = 0; + size_t max_len = 0; /// length of the longest key in training data for (const auto & key : train_data) max_len = std::max(max_len, key.size()); - // while not all partial keys unique, choose new byte and recalculate the entropy while (!allPartialKeysAreUnique(train_data, partial_key_positions)) { auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); @@ -198,6 +194,7 @@ public: { const size_t num_rows = col_data_string->size(); + /// TODO this does some needless copying ... chooseBytes() should ideally understand the native ColumnString representation std::vector training_data; for (size_t i = 0; i < num_rows; ++i) {