From 500f3d395117b46889c7f6b0ff2fb78d67e1a0f6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 10:18:55 +0000 Subject: [PATCH 01/30] Add SQL functions for Entropy Learned Hashing Courtesy to @Dmitry909, I just wrapped up his work. --- src/Functions/EntropyLearnedHash.cpp | 286 ++++++++++++++++++ ...new_functions_must_be_documented.reference | 2 + .../02734_entropy_learned_hashing.reference | 12 + .../02734_entropy_learned_hashing.sql | 24 ++ 4 files changed, 324 insertions(+) create mode 100644 src/Functions/EntropyLearnedHash.cpp create mode 100644 tests/queries/0_stateless/02734_entropy_learned_hashing.reference create mode 100644 tests/queries/0_stateless/02734_entropy_learned_hashing.sql diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp new file mode 100644 index 00000000000..3bb99e51c48 --- /dev/null +++ b/src/Functions/EntropyLearnedHash.cpp @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/// Implementation of entropy-learned hashing: https://dl.acm.org/doi/10.1145/3514221.3517894 +/// TODOs for future work: +/// - allow to specify an arbitrary hash funtion (currently always CityHash is used) +/// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int ILLEGAL_COLUMN; +} + +namespace EntropyLearnedHashing +{ + +using Key = String; +using PartialKeyPositions = std::vector; + +namespace +{ + +Key getPartialKey(const Key & key, const std::vector & positions) +{ + Key result_key; + result_key.reserve(positions.size()); + for (auto position : positions) + if (position < key.size()) + result_key.push_back(key[position]); + return result_key; +} + +bool allPartialKeysAreUnique(const std::vector & data, const std::vector & positions) +{ + std::unordered_set partial_keys; + partial_keys.reserve(data.size()); + for (const auto & key : data) + if (!partial_keys.insert(EntropyLearnedHashing::getPartialKey(key, positions)).second) + return false; + return true; +} + +// NextByte returns position of byte which adds the most entropy and the new entropy +std::pair nextByte(const std::vector & keys, size_t max_len, std::vector & chosen_bytes) +{ + size_t min_collisions = std::numeric_limits::max(); + size_t best_position = 0; + + std::unordered_map count_table; + for (size_t i = 0; i < max_len; ++i) + { + count_table.clear(); + count_table.reserve(keys.size()); + + chosen_bytes.push_back(i); + size_t collisions = 0; + for (const auto & key : keys) + { + auto partial_key = EntropyLearnedHashing::getPartialKey(key, chosen_bytes); + collisions += count_table[partial_key]++; + } + + if (collisions < min_collisions) + { + min_collisions = collisions; + best_position = i; + } + chosen_bytes.pop_back(); + } + return {best_position, min_collisions}; +} + +// std::pair nextByte(const std::vector & keys, std::vector & chosen_bytes) +// { +// size_t max_len = 0; +// for (const auto & key : keys) +// max_len = std::max(max_len, key.size()); + +// return nextByte(keys, max_len, chosen_bytes); +// } + +class IdManager +{ +public: + static IdManager & instance() + { + static IdManager instance; + return instance; + } + void setPartialKeyPositionsForId(const String & user_name, const String & id, const PartialKeyPositions & partial_key_positions) + { + std::lock_guard lock(mutex); + /// partial_key_positions_by_id[id] = partial_key_positions; + auto & ids_for_user = 
partial_key_positions_by_id[user_name]; + ids_for_user[id] = partial_key_positions; + } + const PartialKeyPositions & getPartialKeyPositionsForId(const String & user_name, const String & id) const + { + std::lock_guard lock(mutex); + auto it_user = partial_key_positions_by_id.find(user_name); + if (it_user == partial_key_positions_by_id.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); + auto it_id = it_user->second.find(id); + if (it_id == it_user->second.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); + return it_id->second; + } + +private: + mutable std::mutex mutex; + /// Map: user name --> (Map: dataset id --> byte positions to hash) + std::map> partial_key_positions_by_id TSA_GUARDED_BY(mutex); +}; + +std::pair, std::vector> chooseBytes(const std::vector & train_data) +{ + if (train_data.size() <= 1) + return {}; + + // position contains numbers of chosen bytes + std::vector positions; + + // entropies contains entropies of keys after each new chosen byte + std::vector entropies; + + // max_len is a maximal length of any key in train_data + size_t max_len = 0; + for (const auto & key : train_data) + max_len = std::max(max_len, key.size()); + + // while not all partial keys unique, choose new byte and recalculate the entropy + while (!allPartialKeysAreUnique(train_data, positions)) + { + auto [new_position, new_entropy] = nextByte(train_data, max_len, positions); + if (!entropies.empty() && new_entropy == entropies.back()) + break; + positions.push_back(new_position); + entropies.push_back(new_entropy); + } + return {positions, entropies}; +} + +} + +class FunctionTrainEntropyLearnedHash : public IFunction +{ +public: + static constexpr auto name = "trainEntropyLearnedHash"; + static FunctionPtr create(ContextPtr context) { return std::make_shared(context->getUserName()); } + explicit FunctionTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} + + String getName() const override { return name; } + bool isVariadic() const override { return false; } + size_t getNumberOfArguments() const override { return 2; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors args{ + {"data", &isString, nullptr, "String"}, + {"id", &isString, nullptr, "String"} + }; + + validateFunctionArgumentTypes(*this, arguments, args); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override + { + const IColumn * id_col = arguments[1].column.get(); + const ColumnConst * id_col_const = checkAndGetColumn(id_col); + const String id = id_col_const->getValue(); + + const auto * data_col = arguments[0].column.get(); + if (const ColumnString * col_data_string = checkAndGetColumn(data_col)) + { + const size_t num_rows = col_data_string->size(); + + std::vector training_data; + for (size_t i = 0; i < num_rows; ++i) + { + std::string_view string_ref = col_data_string->getDataAt(i).toView(); + training_data.emplace_back(string_ref.data(), string_ref.size()); + } + + 
EntropyLearnedHashing::PartialKeyPositions partial_key_positions = EntropyLearnedHashing::chooseBytes(training_data).first; + auto & id_manager = EntropyLearnedHashing::IdManager::instance(); + id_manager.setPartialKeyPositionsForId(user_name, id, partial_key_positions); + + return result_type->createColumnConst(num_rows, 0u)->convertToFullColumnIfConst(); + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", + arguments.begin()->column->getName(), getName()); + } +private: + const String user_name; +}; + + +class FunctionEntropyLearnedHash : public IFunction +{ +public: + static constexpr auto name = "entropyLearnedHash"; + static FunctionPtr create(ContextPtr context) { return std::make_shared(context->getUserName()); } + explicit FunctionEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} + + String getName() const override { return name; } + bool isVariadic() const override { return false; } + size_t getNumberOfArguments() const override { return 2; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors args{ + {"data", &isString, nullptr, "String"}, + {"id", &isString, nullptr, "String"} + }; + + validateFunctionArgumentTypes(*this, arguments, args); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + { + const IColumn * id_col = arguments.back().column.get(); + const ColumnConst * id_col_const = checkAndGetColumn(id_col); + const String id = id_col_const->getValue(); + + const auto & id_manager = EntropyLearnedHashing::IdManager::instance(); + const auto & partial_key_positions = id_manager.getPartialKeyPositionsForId(user_name, id); + + const auto * data_col = arguments[0].column.get(); + if (const auto * col_data_string = checkAndGetColumn(data_col)) + { + const size_t num_rows = col_data_string->size(); + auto col_res = ColumnUInt64::create(num_rows); + + auto & col_res_vec = col_res->getData(); + for (size_t i = 0; i < num_rows; ++i) + { + std::string_view string_ref = col_data_string->getDataAt(i).toView(); + EntropyLearnedHashing::Key key(string_ref.data(), string_ref.size()); + EntropyLearnedHashing::Key partial_key = EntropyLearnedHashing::getPartialKey(key, partial_key_positions); + col_res_vec[i] = CityHash_v1_0_2::CityHash64(partial_key.data(), partial_key.size()); + } + + return col_res; + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", + arguments.begin()->column->getName(), getName()); + } +private: + const String user_name; +}; + +REGISTER_FUNCTION(Hashing) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} + +} diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index eb574c4e4f5..315cf7fc0d4 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -265,6 +265,7 @@ encodeURLComponent encodeURLFormComponent 
encodeXMLComponent endsWith +entropyLearnedHash equals erf erfc @@ -860,6 +861,7 @@ toYear toYearWeek today tokens +trainEntropyLearnedHash transactionID transactionLatestSnapshot transactionOldestSnapshot diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference new file mode 100644 index 00000000000..2240da7fea4 --- /dev/null +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference @@ -0,0 +1,12 @@ +0 +0 +0 +2603192927274642682 +4947675599669400333 +10783339242466472992 +0 +0 +0 +2603192927274642682 +4947675599669400333 +10783339242466472992 diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql new file mode 100644 index 00000000000..d50883f3cf8 --- /dev/null +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql @@ -0,0 +1,24 @@ +-- Tags: no-parallel +-- Tag no-parallel: Accesses global state + +DROP TABLE IF EXISTS tbl1; +CREATE TABLE tbl1 (x String) ENGINE=Memory; +-- no commonalities between keys +INSERT INTO tbl1 VALUES ('a'), ('b'), ('c'); +SELECT trainEntropyLearnedHash(x, 'id1') FROM tbl1; +SELECT entropyLearnedHash(x, 'id1') FROM tbl1; + +SELECT trainEntropyLearnedHash(x, 1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +-- SELECT trainEntropyLearnedHash(x, NULL) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT trainEntropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT entropyLearnedHash(x, 'non-existing id') FROM tbl1; -- { serverError BAD_ARGUMENTS } + +DROP TABLE IF EXISTS tbl2; +CREATE TABLE tbl2 (x String) ENGINE=Memory; +-- with commonalities between keys +INSERT INTO tbl2 VALUES ('aa'), ('ba'), ('ca'); +SELECT trainEntropyLearnedHash(x, 'id1') FROM tbl2; +SELECT entropyLearnedHash(x, 'id1') FROM tbl2; + +DROP TABLE tbl1; +DROP TABLE tbl2; From bb5a25e81c9cec334693bbc7e7b3b0f14c6f107b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:05:44 +0000 Subject: [PATCH 02/30] Fix typo --- src/Functions/EntropyLearnedHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 3bb99e51c48..921b7921e3b 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -11,7 +11,7 @@ /// Implementation of entropy-learned hashing: https://dl.acm.org/doi/10.1145/3514221.3517894 /// TODOs for future work: -/// - allow to specify an arbitrary hash funtion (currently always CityHash is used) +/// - allow to specify an arbitrary hash function (currently always CityHash is used) /// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) From b9e8c52057a584991ce9914eaae00a230431c5c4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:08:22 +0000 Subject: [PATCH 03/30] Fix function registration --- src/Functions/EntropyLearnedHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 921b7921e3b..1b9f8d0772b 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -275,7 +275,7 @@ private: const String user_name; }; -REGISTER_FUNCTION(Hashing) +REGISTER_FUNCTION(EntropyLearnedHash) { factory.registerFunction(); factory.registerFunction(); From 267e0c4ef5085a2b18d15b2ad5a8402f7c3912bc Mon Sep 17 00:00:00 2001 From: Robert 
Schulze Date: Mon, 8 May 2023 12:10:06 +0000 Subject: [PATCH 04/30] More typedef usage --- src/Functions/EntropyLearnedHash.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 1b9f8d0772b..873ce9a0386 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -33,22 +33,22 @@ using PartialKeyPositions = std::vector; namespace { -Key getPartialKey(const Key & key, const std::vector & positions) +Key getPartialKey(const Key & key, const PartialKeyPositions & partial_key_positions) { Key result_key; - result_key.reserve(positions.size()); - for (auto position : positions) - if (position < key.size()) - result_key.push_back(key[position]); + result_key.reserve(partial_key_positions.size()); + for (auto partial_key_position : partial_key_positions) + if (partial_key_position < key.size()) + result_key.push_back(key[partial_key_position]); return result_key; } -bool allPartialKeysAreUnique(const std::vector & data, const std::vector & positions) +bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) { std::unordered_set partial_keys; partial_keys.reserve(data.size()); for (const auto & key : data) - if (!partial_keys.insert(EntropyLearnedHashing::getPartialKey(key, positions)).second) + if (!partial_keys.insert(EntropyLearnedHashing::getPartialKey(key, partial_key_positions)).second) return false; return true; } @@ -125,13 +125,13 @@ private: std::map> partial_key_positions_by_id TSA_GUARDED_BY(mutex); }; -std::pair, std::vector> chooseBytes(const std::vector & train_data) +std::pair> chooseBytes(const std::vector & train_data) { if (train_data.size() <= 1) return {}; // position contains numbers of chosen bytes - std::vector positions; + PartialKeyPositions partial_key_positions; // entropies contains entropies of keys after each new chosen byte std::vector entropies; @@ -142,15 +142,15 @@ std::pair, std::vector> chooseBytes(const std::vecto max_len = std::max(max_len, key.size()); // while not all partial keys unique, choose new byte and recalculate the entropy - while (!allPartialKeysAreUnique(train_data, positions)) + while (!allPartialKeysAreUnique(train_data, partial_key_positions)) { - auto [new_position, new_entropy] = nextByte(train_data, max_len, positions); + auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); if (!entropies.empty() && new_entropy == entropies.back()) break; - positions.push_back(new_position); + partial_key_positions.push_back(new_position); entropies.push_back(new_entropy); } - return {positions, entropies}; + return {partial_key_positions, entropies}; } } From 03e9522de42141af75302fbc483a79fb603fe72d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:13:52 +0000 Subject: [PATCH 05/30] Less namespace clutter --- src/Functions/EntropyLearnedHash.cpp | 31 ++++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 873ce9a0386..8b9fc592696 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -24,9 +24,6 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -namespace EntropyLearnedHashing -{ - using Key = String; using PartialKeyPositions = std::vector; @@ -43,23 +40,23 @@ Key getPartialKey(const Key & key, const PartialKeyPositions & 
partial_key_posit return result_key; } -bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) +bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) { - std::unordered_set partial_keys; + std::unordered_set partial_keys; partial_keys.reserve(data.size()); for (const auto & key : data) - if (!partial_keys.insert(EntropyLearnedHashing::getPartialKey(key, partial_key_positions)).second) + if (!partial_keys.insert(getPartialKey(key, partial_key_positions)).second) return false; return true; } // NextByte returns position of byte which adds the most entropy and the new entropy -std::pair nextByte(const std::vector & keys, size_t max_len, std::vector & chosen_bytes) +std::pair nextByte(const std::vector & keys, size_t max_len, std::vector & chosen_bytes) { size_t min_collisions = std::numeric_limits::max(); size_t best_position = 0; - std::unordered_map count_table; + std::unordered_map count_table; for (size_t i = 0; i < max_len; ++i) { count_table.clear(); @@ -69,7 +66,7 @@ std::pair nextByte(const std::vector size_t collisions = 0; for (const auto & key : keys) { - auto partial_key = EntropyLearnedHashing::getPartialKey(key, chosen_bytes); + auto partial_key = getPartialKey(key, chosen_bytes); collisions += count_table[partial_key]++; } @@ -83,7 +80,7 @@ std::pair nextByte(const std::vector return {best_position, min_collisions}; } -// std::pair nextByte(const std::vector & keys, std::vector & chosen_bytes) +// std::pair nextByte(const std::vector & keys, std::vector & chosen_bytes) // { // size_t max_len = 0; // for (const auto & key : keys) @@ -192,15 +189,15 @@ public: { const size_t num_rows = col_data_string->size(); - std::vector training_data; + std::vector training_data; for (size_t i = 0; i < num_rows; ++i) { std::string_view string_ref = col_data_string->getDataAt(i).toView(); training_data.emplace_back(string_ref.data(), string_ref.size()); } - EntropyLearnedHashing::PartialKeyPositions partial_key_positions = EntropyLearnedHashing::chooseBytes(training_data).first; - auto & id_manager = EntropyLearnedHashing::IdManager::instance(); + PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; + auto & id_manager = IdManager::instance(); id_manager.setPartialKeyPositionsForId(user_name, id, partial_key_positions); return result_type->createColumnConst(num_rows, 0u)->convertToFullColumnIfConst(); @@ -247,7 +244,7 @@ public: const ColumnConst * id_col_const = checkAndGetColumn(id_col); const String id = id_col_const->getValue(); - const auto & id_manager = EntropyLearnedHashing::IdManager::instance(); + const auto & id_manager = IdManager::instance(); const auto & partial_key_positions = id_manager.getPartialKeyPositionsForId(user_name, id); const auto * data_col = arguments[0].column.get(); @@ -260,8 +257,8 @@ public: for (size_t i = 0; i < num_rows; ++i) { std::string_view string_ref = col_data_string->getDataAt(i).toView(); - EntropyLearnedHashing::Key key(string_ref.data(), string_ref.size()); - EntropyLearnedHashing::Key partial_key = EntropyLearnedHashing::getPartialKey(key, partial_key_positions); + Key key(string_ref.data(), string_ref.size()); + Key partial_key = getPartialKey(key, partial_key_positions); col_res_vec[i] = CityHash_v1_0_2::CityHash64(partial_key.data(), partial_key.size()); } @@ -282,5 +279,3 @@ REGISTER_FUNCTION(EntropyLearnedHash) } } - -} From fdabce9a6818249e1ea7f75feeaf4c59708efb4f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: 
Mon, 8 May 2023 12:15:08 +0000 Subject: [PATCH 06/30] Move chooseBytes() up --- src/Functions/EntropyLearnedHash.cpp | 62 ++++++++++++++-------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 8b9fc592696..6cb4d7350a5 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -24,12 +24,12 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -using Key = String; -using PartialKeyPositions = std::vector; - namespace { +using Key = String; +using PartialKeyPositions = std::vector; + Key getPartialKey(const Key & key, const PartialKeyPositions & partial_key_positions) { Key result_key; @@ -89,6 +89,34 @@ std::pair nextByte(const std::vector & keys, size_t max_len // return nextByte(keys, max_len, chosen_bytes); // } +std::pair> chooseBytes(const std::vector & train_data) +{ + if (train_data.size() <= 1) + return {}; + + // position contains numbers of chosen bytes + PartialKeyPositions partial_key_positions; + + // entropies contains entropies of keys after each new chosen byte + std::vector entropies; + + // max_len is a maximal length of any key in train_data + size_t max_len = 0; + for (const auto & key : train_data) + max_len = std::max(max_len, key.size()); + + // while not all partial keys unique, choose new byte and recalculate the entropy + while (!allPartialKeysAreUnique(train_data, partial_key_positions)) + { + auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); + if (!entropies.empty() && new_entropy == entropies.back()) + break; + partial_key_positions.push_back(new_position); + entropies.push_back(new_entropy); + } + return {partial_key_positions, entropies}; +} + class IdManager { public: @@ -122,34 +150,6 @@ private: std::map> partial_key_positions_by_id TSA_GUARDED_BY(mutex); }; -std::pair> chooseBytes(const std::vector & train_data) -{ - if (train_data.size() <= 1) - return {}; - - // position contains numbers of chosen bytes - PartialKeyPositions partial_key_positions; - - // entropies contains entropies of keys after each new chosen byte - std::vector entropies; - - // max_len is a maximal length of any key in train_data - size_t max_len = 0; - for (const auto & key : train_data) - max_len = std::max(max_len, key.size()); - - // while not all partial keys unique, choose new byte and recalculate the entropy - while (!allPartialKeysAreUnique(train_data, partial_key_positions)) - { - auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); - if (!entropies.empty() && new_entropy == entropies.back()) - break; - partial_key_positions.push_back(new_position); - entropies.push_back(new_entropy); - } - return {partial_key_positions, entropies}; -} - } class FunctionTrainEntropyLearnedHash : public IFunction From 1b7c207d7a9dd20a50e253e28f47482d09a8a0f4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:16:12 +0000 Subject: [PATCH 07/30] Replace ACM link by DOI link --- src/Functions/EntropyLearnedHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 6cb4d7350a5..e926bd3cfcc 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -9,7 +9,7 @@ #include #include -/// Implementation of entropy-learned hashing: https://dl.acm.org/doi/10.1145/3514221.3517894 +/// Implementation of entropy-learned hashing: 
https://doi.org/10.1145/3514221.3517894 /// TODOs for future work: /// - allow to specify an arbitrary hash function (currently always CityHash is used) /// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) From d2216a433987b6eba1902e0fd753fb0d54a6e6b5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:19:39 +0000 Subject: [PATCH 08/30] Remove leftover --- src/Functions/EntropyLearnedHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index e926bd3cfcc..cb3ca0b132f 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -13,6 +13,7 @@ /// TODOs for future work: /// - allow to specify an arbitrary hash function (currently always CityHash is used) /// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) +/// - support more datatypes for data (besides String) namespace DB @@ -128,7 +129,6 @@ public: void setPartialKeyPositionsForId(const String & user_name, const String & id, const PartialKeyPositions & partial_key_positions) { std::lock_guard lock(mutex); - /// partial_key_positions_by_id[id] = partial_key_positions; auto & ids_for_user = partial_key_positions_by_id[user_name]; ids_for_user[id] = partial_key_positions; } From 8b77b706c40b09f849432f2c809fdc9e61940911 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:29:43 +0000 Subject: [PATCH 09/30] Optimize allocations --- src/Functions/EntropyLearnedHash.cpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index cb3ca0b132f..2d07198d583 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -31,23 +31,31 @@ namespace using Key = String; using PartialKeyPositions = std::vector; -Key getPartialKey(const Key & key, const PartialKeyPositions & partial_key_positions) +Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) { - Key result_key; - result_key.reserve(partial_key_positions.size()); + result.clear(); + result.reserve(partial_key_positions.size()); + for (auto partial_key_position : partial_key_positions) if (partial_key_position < key.size()) - result_key.push_back(key[partial_key_position]); - return result_key; + result.push_back(key[partial_key_position]); + + return result; } bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) { std::unordered_set partial_keys; partial_keys.reserve(data.size()); + Key partial_key; + for (const auto & key : data) - if (!partial_keys.insert(getPartialKey(key, partial_key_positions)).second) + { + getPartialKey(key, partial_key_positions, partial_key); + if (!partial_keys.insert(partial_key).second) return false; + } + return true; } @@ -58,6 +66,7 @@ std::pair nextByte(const std::vector & keys, size_t max_len size_t best_position = 0; std::unordered_map count_table; + Key partial_key; for (size_t i = 0; i < max_len; ++i) { count_table.clear(); @@ -67,7 +76,7 @@ std::pair nextByte(const std::vector & keys, size_t max_len size_t collisions = 0; for (const auto & key : keys) { - auto partial_key = getPartialKey(key, chosen_bytes); + getPartialKey(key, chosen_bytes, partial_key); collisions += count_table[partial_key]++; } @@ -254,11 +263,11 @@ public: auto col_res = ColumnUInt64::create(num_rows); auto & 
col_res_vec = col_res->getData(); + Key partial_key; for (size_t i = 0; i < num_rows; ++i) { std::string_view string_ref = col_data_string->getDataAt(i).toView(); - Key key(string_ref.data(), string_ref.size()); - Key partial_key = getPartialKey(key, partial_key_positions); + getPartialKey(string_ref, partial_key_positions, partial_key); col_res_vec[i] = CityHash_v1_0_2::CityHash64(partial_key.data(), partial_key.size()); } From d2dc5e9fc835d0dd95420d6bc50848b1d6a97ce5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:36:28 +0000 Subject: [PATCH 10/30] Improve naming --- src/Functions/EntropyLearnedHash.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 2d07198d583..c42f0f65192 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -45,14 +45,14 @@ Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_ bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) { - std::unordered_set partial_keys; - partial_keys.reserve(data.size()); + std::unordered_set unique_partial_keys; + unique_partial_keys.reserve(data.size()); Key partial_key; for (const auto & key : data) { getPartialKey(key, partial_key_positions, partial_key); - if (!partial_keys.insert(partial_key).second) + if (!unique_partial_keys.insert(partial_key).second) return false; } From 6a454ed6c3e959d1b7ea6cb9af2e74e638cc8b7d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:37:28 +0000 Subject: [PATCH 11/30] Add Entropies typedef --- src/Functions/EntropyLearnedHash.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index c42f0f65192..f7ea42dc55c 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -30,6 +30,7 @@ namespace using Key = String; using PartialKeyPositions = std::vector; +using Entropies = std::vector; Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) { @@ -99,23 +100,18 @@ std::pair nextByte(const std::vector & keys, size_t max_len // return nextByte(keys, max_len, chosen_bytes); // } -std::pair> chooseBytes(const std::vector & train_data) +std::pair chooseBytes(const std::vector & train_data) { if (train_data.size() <= 1) return {}; - // position contains numbers of chosen bytes PartialKeyPositions partial_key_positions; + Entropies entropies; - // entropies contains entropies of keys after each new chosen byte - std::vector entropies; - - // max_len is a maximal length of any key in train_data - size_t max_len = 0; + size_t max_len = 0; /// length of the longest key in training data for (const auto & key : train_data) max_len = std::max(max_len, key.size()); - // while not all partial keys unique, choose new byte and recalculate the entropy while (!allPartialKeysAreUnique(train_data, partial_key_positions)) { auto [new_position, new_entropy] = nextByte(train_data, max_len, partial_key_positions); @@ -198,6 +194,7 @@ public: { const size_t num_rows = col_data_string->size(); + /// TODO this does some needless copying ... 
chooseBytes() should ideally understand the native ColumnString representation std::vector training_data; for (size_t i = 0; i < num_rows; ++i) { From e9d9eda3a2384fe0a9bc58bbe098e9ab990a8ff4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 12:44:25 +0000 Subject: [PATCH 12/30] More typedef usage --- src/Functions/EntropyLearnedHash.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index f7ea42dc55c..6b6effa72d5 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -10,6 +10,8 @@ #include /// Implementation of entropy-learned hashing: https://doi.org/10.1145/3514221.3517894 +/// If you change something in this file, please don't deviate too much from the pseudocode in the paper! + /// TODOs for future work: /// - allow to specify an arbitrary hash function (currently always CityHash is used) /// - allow function chaining a la entropyLearnedHash(trainEntropyLearnedHash()) @@ -61,23 +63,24 @@ bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPosi } // NextByte returns position of byte which adds the most entropy and the new entropy -std::pair nextByte(const std::vector & keys, size_t max_len, std::vector & chosen_bytes) +std::pair nextByte(const std::vector & keys, size_t max_len, PartialKeyPositions & partial_key_positions) { size_t min_collisions = std::numeric_limits::max(); size_t best_position = 0; std::unordered_map count_table; Key partial_key; + for (size_t i = 0; i < max_len; ++i) { count_table.clear(); count_table.reserve(keys.size()); - chosen_bytes.push_back(i); + partial_key_positions.push_back(i); size_t collisions = 0; for (const auto & key : keys) { - getPartialKey(key, chosen_bytes, partial_key); + getPartialKey(key, partial_key_positions, partial_key); collisions += count_table[partial_key]++; } @@ -86,18 +89,19 @@ std::pair nextByte(const std::vector & keys, size_t max_len min_collisions = collisions; best_position = i; } - chosen_bytes.pop_back(); + partial_key_positions.pop_back(); } + return {best_position, min_collisions}; } -// std::pair nextByte(const std::vector & keys, std::vector & chosen_bytes) +// std::pair nextByte(const std::vector & keys, PartialKeyPositions & partial_key_positions) // { // size_t max_len = 0; // for (const auto & key : keys) // max_len = std::max(max_len, key.size()); -// return nextByte(keys, max_len, chosen_bytes); +// return nextByte(keys, max_len, partial_key_positions); // } std::pair chooseBytes(const std::vector & train_data) From 93a2b44702fbd185a1ec691c6e6036cfe83e6cbc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 May 2023 13:11:21 +0000 Subject: [PATCH 13/30] Add docs --- .../sql-reference/functions/hash-functions.md | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index fe842732b89..1e597ed9b7c 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -558,6 +558,54 @@ Result: └───────────────────────────┘ ``` +## Entropy-learned hashing + +Entropy-learned hashing is not a standalone hash function like `metroHash64`, `cityHash64`, `sipHash64` etc. Instead, it aims to preprocess +the data to be hashed in a way that a standalone hash function can be computed more efficiently while not compromising the hash quality, +i.e. 
the randomness of the hashes. For that, entropy-based hashing chooses a subset of the bytes in a training data set of Strings which has +the same randomness (entropy) as the original Strings. For example, if the Strings are in average 100 bytes long, and we pick a subset of 5 +bytes, then a hash function will be 95% less expensive to evaluate. For details of the method, refer to [Entropy-Learned Hashing: Constant +Time Hashing with Controllable Uniformity](https://doi.org/10.1145/3514221.3517894). + +Entropy-learned hashing has two phases: +1. A training phase on a representative but typically small set of Strings to be hashed. Function `trainEntropyLearnedHash(data, id)` + calculates a minimal partial sub-key of `data` and stores it as `id`. +2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)` + hashes `data` using the partial subkey stored as `id`. CityHash64 is used as hash function. + +**Syntax** + +``` sql +trainEntropyLearnedHash(data, id); +entropyLearnedHash(data, id); +``` + +**Example** + +```sql +CREATE TABLE tab (col String) ENGINE=Memory; +INSERT INTO tab VALUES ('aa'), ('ba'), ('ca'); + +SELECT trainEntropyLearnedHash(col, 'id1') AS trained FROM tab; +SELECT entropyLearnedHash(col, 'id1') as hashes FROM tab; +``` + +Result: + +``` response +┌─trained─┐ +│ 0 │ +│ 0 │ +│ 0 │ +└─────────┘ + +┌───────────────hashes─┐ +│ 2603192927274642682 │ +│ 4947675599669400333 │ +│ 10783339242466472992 │ +└──────────────────────┘ +``` + ## metroHash64 Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value. From 37c1b1aa58a3d67c6643b0630331c65a55a87df4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 May 2023 18:49:05 +0000 Subject: [PATCH 14/30] Some fixups --- .../sql-reference/functions/hash-functions.md | 2 +- src/Functions/EntropyLearnedHash.cpp | 26 ++++++------------- .../02734_entropy_learned_hashing.sql | 3 --- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 1e597ed9b7c..52bb4089b10 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -569,7 +569,7 @@ Time Hashing with Controllable Uniformity](https://doi.org/10.1145/3514221.35178 Entropy-learned hashing has two phases: 1. A training phase on a representative but typically small set of Strings to be hashed. Function `trainEntropyLearnedHash(data, id)` - calculates a minimal partial sub-key of `data` and stores it as `id`. + calculates a minimal partial sub-key of `data` and stores it as `id`. The training step outputs dummy `0` values. 2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)` hashes `data` using the partial subkey stored as `id`. CityHash64 is used as hash function. 
diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 6b6effa72d5..ed4ea8ba701 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -34,7 +34,7 @@ using Key = String; using PartialKeyPositions = std::vector; using Entropies = std::vector; -Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) +void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) { result.clear(); result.reserve(partial_key_positions.size()); @@ -42,17 +42,15 @@ Key getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_ for (auto partial_key_position : partial_key_positions) if (partial_key_position < key.size()) result.push_back(key[partial_key_position]); - - return result; } -bool allPartialKeysAreUnique(const std::vector & data, const PartialKeyPositions & partial_key_positions) +bool allPartialKeysAreUnique(const std::vector & keys, const PartialKeyPositions & partial_key_positions) { std::unordered_set unique_partial_keys; - unique_partial_keys.reserve(data.size()); + unique_partial_keys.reserve(keys.size()); Key partial_key; - for (const auto & key : data) + for (const auto & key : keys) { getPartialKey(key, partial_key_positions, partial_key); if (!unique_partial_keys.insert(partial_key).second) @@ -69,12 +67,13 @@ std::pair nextByte(const std::vector & keys, size_t max_len size_t best_position = 0; std::unordered_map count_table; + count_table.reserve(keys.size()); + Key partial_key; for (size_t i = 0; i < max_len; ++i) { count_table.clear(); - count_table.reserve(keys.size()); partial_key_positions.push_back(i); size_t collisions = 0; @@ -95,15 +94,6 @@ std::pair nextByte(const std::vector & keys, size_t max_len return {best_position, min_collisions}; } -// std::pair nextByte(const std::vector & keys, PartialKeyPositions & partial_key_positions) -// { -// size_t max_len = 0; -// for (const auto & key : keys) -// max_len = std::max(max_len, key.size()); - -// return nextByte(keys, max_len, partial_key_positions); -// } - std::pair chooseBytes(const std::vector & train_data) { if (train_data.size() <= 1) @@ -202,8 +192,8 @@ public: std::vector training_data; for (size_t i = 0; i < num_rows; ++i) { - std::string_view string_ref = col_data_string->getDataAt(i).toView(); - training_data.emplace_back(string_ref.data(), string_ref.size()); + std::string_view string_view = col_data_string->getDataAt(i).toView(); + training_data.emplace_back(string_view.data(), string_view.size()); } PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql index d50883f3cf8..2fee866710f 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql @@ -1,6 +1,3 @@ --- Tags: no-parallel --- Tag no-parallel: Accesses global state - DROP TABLE IF EXISTS tbl1; CREATE TABLE tbl1 (x String) ENGINE=Memory; -- no commonalities between keys From 4a168444fa7156c401730b76ae5953c16446b2e0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 May 2023 19:03:17 +0000 Subject: [PATCH 15/30] Store keys as std::string_view --- src/Functions/EntropyLearnedHash.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp 
b/src/Functions/EntropyLearnedHash.cpp index ed4ea8ba701..af5b2c3d232 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -30,11 +30,10 @@ namespace ErrorCodes namespace { -using Key = String; using PartialKeyPositions = std::vector; using Entropies = std::vector; -void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, Key & result) +void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key_positions, String & result) { result.clear(); result.reserve(partial_key_positions.size()); @@ -44,11 +43,11 @@ void getPartialKey(std::string_view key, const PartialKeyPositions & partial_key result.push_back(key[partial_key_position]); } -bool allPartialKeysAreUnique(const std::vector & keys, const PartialKeyPositions & partial_key_positions) +bool allPartialKeysAreUnique(const std::vector & keys, const PartialKeyPositions & partial_key_positions) { - std::unordered_set unique_partial_keys; + std::unordered_set unique_partial_keys; unique_partial_keys.reserve(keys.size()); - Key partial_key; + String partial_key; for (const auto & key : keys) { @@ -61,15 +60,15 @@ bool allPartialKeysAreUnique(const std::vector & keys, const PartialKeyPosi } // NextByte returns position of byte which adds the most entropy and the new entropy -std::pair nextByte(const std::vector & keys, size_t max_len, PartialKeyPositions & partial_key_positions) +std::pair nextByte(const std::vector & keys, size_t max_len, PartialKeyPositions & partial_key_positions) { size_t min_collisions = std::numeric_limits::max(); size_t best_position = 0; - std::unordered_map count_table; + std::unordered_map count_table; count_table.reserve(keys.size()); - Key partial_key; + String partial_key; for (size_t i = 0; i < max_len; ++i) { @@ -94,7 +93,7 @@ std::pair nextByte(const std::vector & keys, size_t max_len return {best_position, min_collisions}; } -std::pair chooseBytes(const std::vector & train_data) +std::pair chooseBytes(const std::vector & train_data) { if (train_data.size() <= 1) return {}; @@ -189,11 +188,11 @@ public: const size_t num_rows = col_data_string->size(); /// TODO this does some needless copying ... chooseBytes() should ideally understand the native ColumnString representation - std::vector training_data; + std::vector training_data; for (size_t i = 0; i < num_rows; ++i) { std::string_view string_view = col_data_string->getDataAt(i).toView(); - training_data.emplace_back(string_view.data(), string_view.size()); + training_data.emplace_back(string_view); } PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; @@ -254,7 +253,7 @@ public: auto col_res = ColumnUInt64::create(num_rows); auto & col_res_vec = col_res->getData(); - Key partial_key; + String partial_key; for (size_t i = 0; i < num_rows; ++i) { std::string_view string_ref = col_data_string->getDataAt(i).toView(); From bbfb74ab706d003f2206e229efcc2f03c9bd1c65 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 May 2023 19:06:04 +0000 Subject: [PATCH 16/30] Update comment --- src/Functions/EntropyLearnedHash.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index af5b2c3d232..9c517a51f77 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -187,7 +187,6 @@ public: { const size_t num_rows = col_data_string->size(); - /// TODO this does some needless copying ... 
chooseBytes() should ideally understand the native ColumnString representation std::vector training_data; for (size_t i = 0; i < num_rows; ++i) { From 1f06065de7bb05eda3da98c94c7d464fab511a4c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 12 May 2023 08:21:53 +0000 Subject: [PATCH 17/30] Try to fix test --- tests/queries/0_stateless/02734_entropy_learned_hashing.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql index 2fee866710f..c0c07b26938 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql @@ -14,8 +14,8 @@ DROP TABLE IF EXISTS tbl2; CREATE TABLE tbl2 (x String) ENGINE=Memory; -- with commonalities between keys INSERT INTO tbl2 VALUES ('aa'), ('ba'), ('ca'); -SELECT trainEntropyLearnedHash(x, 'id1') FROM tbl2; -SELECT entropyLearnedHash(x, 'id1') FROM tbl2; +SELECT trainEntropyLearnedHash(x, 'id2') FROM tbl2; +SELECT entropyLearnedHash(x, 'id2') FROM tbl2; DROP TABLE tbl1; DROP TABLE tbl2; From 9fb09469f003caf704ca1548a1039c6c98a2feb6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 13 May 2023 20:38:05 +0000 Subject: [PATCH 18/30] Fix expected results --- .../0_stateless/02734_entropy_learned_hashing.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference index 2240da7fea4..d9b014f3745 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference @@ -7,6 +7,6 @@ 0 0 0 -2603192927274642682 -4947675599669400333 -10783339242466472992 +11160318154034397263 +11160318154034397263 +11160318154034397263 From de49a33cb33a9265a3c431bb9ea7d359bc4f9b27 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 14 May 2023 17:36:56 +0000 Subject: [PATCH 19/30] Fix tests, pt. 
II --- .../0_stateless/02734_entropy_learned_hashing.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference index d9b014f3745..2240da7fea4 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference @@ -7,6 +7,6 @@ 0 0 0 -11160318154034397263 -11160318154034397263 -11160318154034397263 +2603192927274642682 +4947675599669400333 +10783339242466472992 From f49160ef4e99fadbff0fcfdad5b4f3e3e6e539f0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 26 May 2023 13:46:59 +0000 Subject: [PATCH 20/30] Build partial key positions from entire training data --- .../sql-reference/functions/hash-functions.md | 39 +++- src/Functions/EntropyLearnedHash.cpp | 196 ++++++++++++++---- .../02734_entropy_learned_hashing.reference | 6 + .../02734_entropy_learned_hashing.sql | 27 ++- 4 files changed, 209 insertions(+), 59 deletions(-) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 6758c7e0114..91c45f90018 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -560,7 +560,9 @@ Result: └───────────────────────────┘ ``` -## Entropy-learned hashing +## Entropy-learned hashing (experimental) + +Entropy-learned hashing is disabled by default, to enable: `SET allow_experimental_hash_functions=1`. Entropy-learned hashing is not a standalone hash function like `metroHash64`, `cityHash64`, `sipHash64` etc. Instead, it aims to preprocess the data to be hashed in a way that a standalone hash function can be computed more efficiently while not compromising the hash quality, @@ -570,15 +572,25 @@ bytes, then a hash function will be 95% less expensive to evaluate. For details Time Hashing with Controllable Uniformity](https://doi.org/10.1145/3514221.3517894). Entropy-learned hashing has two phases: -1. A training phase on a representative but typically small set of Strings to be hashed. Function `trainEntropyLearnedHash(data, id)` - calculates a minimal partial sub-key of `data` and stores it as `id`. The training step outputs dummy `0` values. + +1. A training phase on a representative but typically small set of Strings to be hashed. Training consists of two steps: + + - Function `prepareTrainEntropyLearnedHash(data, id)` caches the training data in a global state under a given `id`. It returns dummy + value `0` on every row. + - Function `trainEntropyLearnedHash(id)` computes a minimal partial sub-key of the training data stored stored under `id` in the global + state. The result is stored in the global state as well. It returns dummy value `0` on every row. + 2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)` hashes `data` using the partial subkey stored as `id`. CityHash64 is used as hash function. +The reason that the training phase comprises two steps is that ClickHouse processes data at chunk granularity but entropy-learned hashing +needs to process the entire training set at once. 
+ **Syntax** ``` sql -trainEntropyLearnedHash(data, id); +prepareTrainEntropyLearnedHash(data, id); +trainEntropyLearnedHash(id); entropyLearnedHash(data, id); ``` @@ -588,18 +600,25 @@ entropyLearnedHash(data, id); CREATE TABLE tab (col String) ENGINE=Memory; INSERT INTO tab VALUES ('aa'), ('ba'), ('ca'); -SELECT trainEntropyLearnedHash(col, 'id1') AS trained FROM tab; +SELECT prepareTrainEntropyLearnedHash(col, 'id1') AS prepared FROM tab; +SELECT trainEntropyLearnedHash('id1') AS trained FROM tab; SELECT entropyLearnedHash(col, 'id1') as hashes FROM tab; ``` Result: ``` response -┌─trained─┐ -│ 0 │ -│ 0 │ -│ 0 │ -└─────────┘ +┌─prepared─┐ +│ 0 │ +│ 0 │ +│ 0 │ +└──────────┘ + +┌─trained─┐ +│ 0 │ +│ 0 │ +│ 0 │ +└─────────┘ ┌───────────────hashes─┐ │ 2603192927274642682 │ diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index 9c517a51f77..a80a8fe0228 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -25,6 +25,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; + extern const int SUPPORT_IS_DISABLED; } namespace @@ -116,49 +117,117 @@ std::pair chooseBytes(const std::vectorinsertRangeFrom(*other_training_sample, 0, other_training_sample->size()); + } + + const ColumnString * concatenated_training_sample_string = checkAndGetColumn(*concatenated_training_sample); + if (!concatenated_training_sample_string) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column"); + + const size_t num_rows = concatenated_training_sample_string->size(); + std::vector training_data; + for (size_t i = 0; i < num_rows; ++i) + { + std::string_view string_view = concatenated_training_sample_string->getDataAt(i).toView(); + training_data.emplace_back(string_view); + } + + PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; + + ids_for_user[id].partial_key_positions = partial_key_positions; + training_samples.clear(); + } + + const PartialKeyPositions & getPartialKeyPositions(const String & user_name, const String & id) const + { + std::lock_guard lock(mutex); + auto it_user = global_state.find(user_name); + if (it_user == global_state.end()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); auto it_id = it_user->second.find(id); if (it_id == it_user->second.end()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Id {} not registered for user in entropy learned hashing", id); - return it_id->second; + return it_id->second.partial_key_positions; } private: mutable std::mutex mutex; - /// Map: user name --> (Map: dataset id --> byte positions to hash) - std::map> partial_key_positions_by_id TSA_GUARDED_BY(mutex); + + /// The state. + struct ColumnsAndPartialKeyPositions + { + /// Caches training data chunks. Filled by prepareTrainEntropyLearnedHash(), cleared by trainEntropyLearnedHash(). + MutableColumns training_samples; + /// The result of the training phase. Filled by trainEntropyLearnedHash(). + PartialKeyPositions partial_key_positions; + }; + + /// Maps a state id to the state. + using IdToColumnsAndPartialKeyPositions = std::map; + + /// Maps the user name to a state id. As a result, the state id is unique at user scope. + using UserNameToId = std::map; + + UserNameToId global_state TSA_GUARDED_BY(mutex); }; } -class FunctionTrainEntropyLearnedHash : public IFunction + +/// Copies all chunks of the training sample column into the global state under a given id. 
+class FunctionPrepareTrainEntropyLearnedHash : public IFunction { public: - static constexpr auto name = "trainEntropyLearnedHash"; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context->getUserName()); } - explicit FunctionTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} + static constexpr auto name = "prepareTrainEntropyLearnedHash"; + static FunctionPtr create(ContextPtr context) + { + if(!context->getSettings().allow_experimental_hash_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it"); + + return std::make_shared(context->getUserName()); + } + explicit FunctionPrepareTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} String getName() const override { return name; } - bool isVariadic() const override { return false; } size_t getNumberOfArguments() const override { return 2; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } @@ -182,42 +251,88 @@ public: const ColumnConst * id_col_const = checkAndGetColumn(id_col); const String id = id_col_const->getValue(); - const auto * data_col = arguments[0].column.get(); - if (const ColumnString * col_data_string = checkAndGetColumn(data_col)) - { - const size_t num_rows = col_data_string->size(); + IColumn::Ptr data_col = arguments[0].column; + IColumn::MutablePtr data_col_mutable = IColumn::mutate(data_col); - std::vector training_data; - for (size_t i = 0; i < num_rows; ++i) - { - std::string_view string_view = col_data_string->getDataAt(i).toView(); - training_data.emplace_back(string_view); - } + auto & global_state = EntropyLearnedHashGlobalState::instance(); + global_state.cacheTrainingSample(user_name, id, std::move(data_col_mutable)); - PartialKeyPositions partial_key_positions = chooseBytes(training_data).first; - auto & id_manager = IdManager::instance(); - id_manager.setPartialKeyPositionsForId(user_name, id, partial_key_positions); - - return result_type->createColumnConst(num_rows, 0u)->convertToFullColumnIfConst(); - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", - arguments.begin()->column->getName(), getName()); + const size_t num_rows = data_col->size(); + return result_type->createColumnConst(num_rows, 0u); /// dummy output } private: const String user_name; }; +/// 1. Concatenates the training samples of a given id in the global state. +/// 2. Computes the partial key positions from the concatenated training samples and stores that in the global state. +/// 3. clear()-s the training samples in the global state. +class FunctionTrainEntropyLearnedHash : public IFunction +{ +public: + static constexpr auto name = "trainEntropyLearnedHash"; + static FunctionPtr create(ContextPtr context) + { + if(!context->getSettings().allow_experimental_hash_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Entropy-learned hashing is experimental. 
Set `allow_experimental_hash_functions` setting to enable it"); + return std::make_shared(context->getUserName()); + } + explicit FunctionTrainEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return false; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors args{ + {"id", &isString, nullptr, "String"} + }; + + validateFunctionArgumentTypes(*this, arguments, args); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override + { + const IColumn * id_col = arguments[0].column.get(); + const ColumnConst * id_col_const = checkAndGetColumn(id_col); + if (!id_col_const) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", + arguments.begin()->column->getName(), getName()); + + auto & global_state = EntropyLearnedHashGlobalState::instance(); + + const String id = id_col_const->getValue(); + global_state.train(user_name, id); + + const size_t num_rows = id_col->size(); + return result_type->createColumnConst(num_rows, 0u); /// dummy output + } +private: + const String user_name; +}; + + +/// Hashes input strings using partial key positions stored in the global state. class FunctionEntropyLearnedHash : public IFunction { public: static constexpr auto name = "entropyLearnedHash"; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context->getUserName()); } + static FunctionPtr create(ContextPtr context) + { + if(!context->getSettings().allow_experimental_hash_functions) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Entropy-learned hashing experimental. 
Set `allow_experimental_hash_functions` setting to enable it"); + return std::make_shared(context->getUserName()); + } explicit FunctionEntropyLearnedHash(const String & user_name_) : IFunction(), user_name(user_name_) {} String getName() const override { return name; } - bool isVariadic() const override { return false; } size_t getNumberOfArguments() const override { return 2; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } @@ -242,8 +357,8 @@ public: const ColumnConst * id_col_const = checkAndGetColumn(id_col); const String id = id_col_const->getValue(); - const auto & id_manager = IdManager::instance(); - const auto & partial_key_positions = id_manager.getPartialKeyPositionsForId(user_name, id); + const auto & global_state = EntropyLearnedHashGlobalState::instance(); + const auto & partial_key_positions = global_state.getPartialKeyPositions(user_name, id); const auto * data_col = arguments[0].column.get(); if (const auto * col_data_string = checkAndGetColumn(data_col)) @@ -272,6 +387,7 @@ private: REGISTER_FUNCTION(EntropyLearnedHash) { + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); } diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference index 2240da7fea4..f558e3cd444 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.reference +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.reference @@ -1,12 +1,18 @@ 0 0 0 +0 +0 +0 2603192927274642682 4947675599669400333 10783339242466472992 0 0 0 +0 +0 +0 2603192927274642682 4947675599669400333 10783339242466472992 diff --git a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql index c0c07b26938..ae829fa03c3 100644 --- a/tests/queries/0_stateless/02734_entropy_learned_hashing.sql +++ b/tests/queries/0_stateless/02734_entropy_learned_hashing.sql @@ -1,21 +1,30 @@ +-- Tags: no-parallel +-- no-parallel because entropy-learned hash uses global state + +SET allow_experimental_hash_functions = 1; + +-- no commonalities between keys DROP TABLE IF EXISTS tbl1; CREATE TABLE tbl1 (x String) ENGINE=Memory; --- no commonalities between keys INSERT INTO tbl1 VALUES ('a'), ('b'), ('c'); -SELECT trainEntropyLearnedHash(x, 'id1') FROM tbl1; +SELECT prepareTrainEntropyLearnedHash(x, 'id1') FROM tbl1; +SELECT trainEntropyLearnedHash('id1') FROM tbl1; SELECT entropyLearnedHash(x, 'id1') FROM tbl1; -SELECT trainEntropyLearnedHash(x, 1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } --- SELECT trainEntropyLearnedHash(x, NULL) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT trainEntropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT entropyLearnedHash(x, 'non-existing id') FROM tbl1; -- { serverError BAD_ARGUMENTS } - +-- with commonalities between keys DROP TABLE IF EXISTS tbl2; CREATE TABLE tbl2 (x String) ENGINE=Memory; --- with commonalities between keys INSERT INTO tbl2 VALUES ('aa'), ('ba'), ('ca'); -SELECT trainEntropyLearnedHash(x, 'id2') FROM tbl2; +SELECT prepareTrainEntropyLearnedHash(x, 'id2') FROM tbl2; +SELECT trainEntropyLearnedHash('id2') FROM tbl2; SELECT entropyLearnedHash(x, 'id2') FROM tbl2; +-- negative tests +SELECT prepareTrainEntropyLearnedHash(x, 1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } 
+SELECT prepareTrainEntropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT trainEntropyLearnedHash(1) FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT entropyLearnedHash(1, 'id1') FROM tbl1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT entropyLearnedHash(x, 'non-existing id') FROM tbl1; -- { serverError BAD_ARGUMENTS }
+
 DROP TABLE tbl1;
 DROP TABLE tbl2;

From ac1dc7237ec9301c94ac167303bd4e0b80b3114c Mon Sep 17 00:00:00 2001
From: Robert Schulze
Date: Fri, 26 May 2023 15:37:33 +0000
Subject: [PATCH 21/30] Update docs

---
 docs/en/sql-reference/functions/hash-functions.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md
index 91c45f90018..663b8468a94 100644
--- a/docs/en/sql-reference/functions/hash-functions.md
+++ b/docs/en/sql-reference/functions/hash-functions.md
@@ -578,7 +578,7 @@ Entropy-learned hashing has two phases:

 - Function `prepareTrainEntropyLearnedHash(data, id)` caches the training data in a global state under a given `id`. It returns dummy
   value `0` on every row.
 - Function `trainEntropyLearnedHash(id)` computes a minimal partial sub-key of the training data stored under `id` in the global
-  state. The result is stored in the global state as well. It returns dummy value `0` on every row.
+  state. The cached training data in the global state is replaced by the partial key. Dummy value `0` is returned on every row.

 2. An evaluation phase where hashes are computed using the previously calculated partial sub-keys. Function `entropyLearnedHash(data, id)`
    hashes `data` using the partial subkey stored as `id`. CityHash64 is used as hash function.
@@ -586,6 +586,9 @@ Entropy-learned hashing has two phases:
 The reason that the training phase comprises two steps is that ClickHouse processes data at chunk granularity but entropy-learned hashing
 needs to process the entire training set at once.

+Since functions `prepareTrainEntropyLearnedHash()` and `trainEntropyLearnedHash()` access global state, they should not be called in
+parallel with the same `id`.
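+
+For example, a minimal sketch (the table and column names below are only illustrative) that trains two independent datasets under
+separate ids, so that their entries in the global state do not interfere:
+
+```sql
+SET allow_experimental_hash_functions = 1;
+
+-- train one id per dataset
+SELECT prepareTrainEntropyLearnedHash(key, 'orders_id') FROM orders;
+SELECT trainEntropyLearnedHash('orders_id') FROM orders;
+
+SELECT prepareTrainEntropyLearnedHash(key, 'users_id') FROM users;
+SELECT trainEntropyLearnedHash('users_id') FROM users;
+
+-- hash each dataset with the id it was trained under
+SELECT entropyLearnedHash(key, 'orders_id') FROM orders;
+SELECT entropyLearnedHash(key, 'users_id') FROM users;
+```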
+ **Syntax** ``` sql @@ -597,6 +600,7 @@ entropyLearnedHash(data, id); **Example** ```sql +SET allow_experimental_hash_functions=1; CREATE TABLE tab (col String) ENGINE=Memory; INSERT INTO tab VALUES ('aa'), ('ba'), ('ca'); From 503105d4b1f1fb66c7f440478a8e4492dd66280d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 29 May 2023 08:07:46 +0000 Subject: [PATCH 22/30] Fix 02415_all_new_functions_must_be_documented --- .../02415_all_new_functions_must_be_documented.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index f2aed15e367..b2c9073648e 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -559,6 +559,7 @@ positionCaseInsensitive positionCaseInsensitiveUTF8 positionUTF8 pow +prepareTrainEntropyLearnedHash proportionsZTest protocol queryID From 41d60f0be3209663fa8659fdb22fa7ea042fa5fe Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 29 May 2023 08:08:47 +0000 Subject: [PATCH 23/30] Fix style --- src/Functions/EntropyLearnedHash.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Functions/EntropyLearnedHash.cpp b/src/Functions/EntropyLearnedHash.cpp index a80a8fe0228..854379bbb9d 100644 --- a/src/Functions/EntropyLearnedHash.cpp +++ b/src/Functions/EntropyLearnedHash.cpp @@ -219,7 +219,7 @@ public: static constexpr auto name = "prepareTrainEntropyLearnedHash"; static FunctionPtr create(ContextPtr context) { - if(!context->getSettings().allow_experimental_hash_functions) + if (!context->getSettings().allow_experimental_hash_functions) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it"); @@ -274,7 +274,7 @@ public: static constexpr auto name = "trainEntropyLearnedHash"; static FunctionPtr create(ContextPtr context) { - if(!context->getSettings().allow_experimental_hash_functions) + if (!context->getSettings().allow_experimental_hash_functions) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Entropy-learned hashing is experimental. Set `allow_experimental_hash_functions` setting to enable it"); return std::make_shared(context->getUserName()); @@ -325,7 +325,7 @@ public: static constexpr auto name = "entropyLearnedHash"; static FunctionPtr create(ContextPtr context) { - if(!context->getSettings().allow_experimental_hash_functions) + if (!context->getSettings().allow_experimental_hash_functions) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Entropy-learned hashing experimental. 
Set `allow_experimental_hash_functions` setting to enable it"); return std::make_shared(context->getUserName()); From 91a3c881267d1d725db78da4ec8ed5d53ae230fb Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 30 May 2023 15:49:47 +0200 Subject: [PATCH 24/30] less logs in WriteBufferFromS3 --- src/IO/WriteBufferFromS3.cpp | 8 -------- src/IO/WriteBufferFromS3TaskTracker.cpp | 11 ----------- 2 files changed, 19 deletions(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 954c996d929..ffdc23b274b 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -195,18 +195,14 @@ void WriteBufferFromS3::finalizeImpl() if (request_settings.check_objects_after_upload) { - LOG_TRACE(log, "Checking object {} exists after upload", key); S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload"); - LOG_TRACE(log, "Checking object {} has size as expected {}", key, total_size); size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage); if (actual_size != total_size) throw Exception( ErrorCodes::S3_ERROR, "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.", key, bucket, actual_size, total_size); - - LOG_TRACE(log, "Object {} exists after upload", key); } } @@ -286,8 +282,6 @@ void WriteBufferFromS3::reallocateFirstBuffer() WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); chassert(offset() == 0); - - LOG_TRACE(log, "Reallocated first buffer with size {}. {}", memory.size(), getLogDetails()); } void WriteBufferFromS3::detachBuffer() @@ -310,8 +304,6 @@ void WriteBufferFromS3::allocateFirstBuffer() const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); memory = Memory(size); WriteBuffer::set(memory.data(), memory.size()); - - LOG_TRACE(log, "Allocated first buffer with size {}. 
{}", memory.size(), getLogDetails()); } void WriteBufferFromS3::allocateBuffer() diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp index 4abae90eeac..b023de16c98 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -36,8 +36,6 @@ ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() void WriteBufferFromS3::TaskTracker::waitAll() { - LOG_TEST(log, "waitAll, in queue {}", futures.size()); - /// Exceptions are propagated for (auto & future : futures) { @@ -51,8 +49,6 @@ void WriteBufferFromS3::TaskTracker::waitAll() void WriteBufferFromS3::TaskTracker::safeWaitAll() { - LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size()); - for (auto & future : futures) { if (future.valid()) @@ -76,7 +72,6 @@ void WriteBufferFromS3::TaskTracker::safeWaitAll() void WriteBufferFromS3::TaskTracker::waitIfAny() { - LOG_TEST(log, "waitIfAny, in queue {}", futures.size()); if (futures.empty()) return; @@ -101,8 +96,6 @@ void WriteBufferFromS3::TaskTracker::waitIfAny() watch.stop(); ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); - - LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size()); } void WriteBufferFromS3::TaskTracker::add(Callback && func) @@ -147,8 +140,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() if (!max_tasks_inflight) return; - LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size()); - Stopwatch watch; /// Alternative approach is to wait until at least futures.size() - max_tasks_inflight element are finished @@ -178,8 +169,6 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() watch.stop(); ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); - - LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size()); } bool WriteBufferFromS3::TaskTracker::isAsync() const From 060d0118d743a1c77e3c2ac692d6fc8c81c61900 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 30 May 2023 12:08:24 -0300 Subject: [PATCH 25/30] Update merge-tree-settings.md --- docs/en/operations/settings/merge-tree-settings.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 0b1207ee7b6..35f9e4b29a7 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -577,7 +577,7 @@ Default value: 20 **Usage** -The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception. +The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception. ## max_part_loading_threads {#max-part-loading-threads} @@ -840,4 +840,4 @@ Possible values: - `Always` or `Never`. 
-Default value: `Never` \ No newline at end of file +Default value: `Never` From cfa3fbf5b8718692f06aaacce6a5dab3ee38da87 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 30 May 2023 12:06:27 -0400 Subject: [PATCH 26/30] add .md extensions --- docs/en/operations/settings/merge-tree-settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 35f9e4b29a7..4122b4af40f 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -577,7 +577,7 @@ Default value: 20 **Usage** -The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception. +The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception. ## max_part_loading_threads {#max-part-loading-threads} From 8d6244ed991e30129a5f66aea85842a2c45d8520 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 30 May 2023 17:10:10 +0000 Subject: [PATCH 27/30] Docs: Explain non-production / advanced install options in more detail Fixes: #49878 --- docs/en/getting-started/install.md | 41 +++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 3f6c2577c94..74d712d9f5e 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -28,21 +28,23 @@ The quickest and easiest way to get up and running with ClickHouse is to create For production installs of a specific release version see the [installation options](#available-installation-options) down below. ::: -On Linux and macOS: +On Linux, macOS and FreeBSD: -1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, clickhouse-client, clickhouse-local, -ClickHouse Keeper, and other tools: +1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the + following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, + clickhouse-client, clickhouse-local, ClickHouse Keeper, and other tools: ```bash curl https://clickhouse.com/ | sh ``` 1. Run the following command to start the ClickHouse server: + ```bash ./clickhouse server ``` - The first time you run this script, the necessary files and folders are created in the current directory, then the server starts. + The first time you run this script, the necessary files and folders are created in the current directory, then the server starts. 1. 
Open a new terminal and use the **clickhouse-client** to connect to your service: @@ -330,7 +332,9 @@ For production environments, it’s recommended to use the latest `stable`-versi To run ClickHouse inside Docker follow the guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/). Those images use official `deb` packages inside. -### From Sources {#from-sources} +## Non-Production Deployments (Advanced) + +### Compile From Source {#from-sources} To manually compile ClickHouse, follow the instructions for [Linux](/docs/en/development/build.md) or [macOS](/docs/en/development/build-osx.md). @@ -346,8 +350,31 @@ You’ll need to create data and metadata folders manually and `chown` them for On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources. -### From CI checks pre-built binaries -ClickHouse binaries are built for each [commit](/docs/en/development/build.md#you-dont-have-to-build-clickhouse). +### Install a CI-generated Binary + +ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit, e.g. +[sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug) builds, cross-compiled builds etc. While such builds are +normally only useful during development, they can in certain situations also be interesting for users. + +:::note +Since ClickHouse's CI is evolving over time, the exact steps to download CI-generated builds may vary. +Also, CI may delete too old build artifacts, making them unavailable for download. +::: + +For example, to download a aarch64 binary for ClickHouse v23.4, follow these steps: + +- Find the GitHub pull request for release v23.4: [Release pull request for branch 23.4](https://github.com/ClickHouse/ClickHouse/pull/49238) +- Click "Commits", then click a commit similar to "Update autogenerated version to 23.4.2.1 and contributors" for the particular version you like to install. +- Click the red cross to open the list of CI checks. +- Click "Details" next to "ClickHouse Build Check" in the list, it will open a page similar to [this page](https://s3.amazonaws.com/clickhouse-test-reports/46793/b460eb70bf29b19eadd19a1f959b15d186705394/clickhouse_build_check/report.html) +- Find the rows with compiler = "clang-*-aarch64" - there are multiple rows. +- Download the artifacts for these builds. + +To download binaries for very old x86-64 systems without [SSE3](https://en.wikipedia.org/wiki/SSE3) support or old ARM systems without +[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a GitHub pull request and find CI check "BuilderBinAmd64Compat", +respectively "BuilderBinAarch64V80Compat". Then click "Details", open the "Build" fold, scroll to the end, find message "Notice: Build URLs +https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the +build. 
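+
+Once a CI-generated binary is downloaded and running (for example via `./clickhouse server` as in the quick-start above), one simple
+way to confirm which build you actually got is to ask the server itself; a small sketch:
+
+```sql
+-- version string of the running server
+SELECT version();
+
+-- compile-time options of this particular build
+SELECT * FROM system.build_options;
+```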
## Launch {#launch} From e0759eff97912fb7b20a70b8a45ce39944711725 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 30 May 2023 17:28:48 +0000 Subject: [PATCH 28/30] Remove install scripts superseded by universal.sh --- docs/_includes/install/arm.sh | 6 ------ docs/_includes/install/freebsd.sh | 3 --- docs/_includes/install/mac-arm.sh | 3 --- docs/_includes/install/mac-x86.sh | 3 --- 4 files changed, 15 deletions(-) delete mode 100644 docs/_includes/install/arm.sh delete mode 100644 docs/_includes/install/freebsd.sh delete mode 100644 docs/_includes/install/mac-arm.sh delete mode 100644 docs/_includes/install/mac-x86.sh diff --git a/docs/_includes/install/arm.sh b/docs/_includes/install/arm.sh deleted file mode 100644 index c6b1f7d12f8..00000000000 --- a/docs/_includes/install/arm.sh +++ /dev/null @@ -1,6 +0,0 @@ -# ARM (AArch64) build works on Amazon Graviton, Oracle Cloud, Huawei Cloud ARM machines. -# The support for AArch64 is pre-production ready. - -wget 'https://builds.clickhouse.com/master/aarch64/clickhouse' -chmod a+x ./clickhouse -sudo ./clickhouse install diff --git a/docs/_includes/install/freebsd.sh b/docs/_includes/install/freebsd.sh deleted file mode 100644 index 2a715a1795f..00000000000 --- a/docs/_includes/install/freebsd.sh +++ /dev/null @@ -1,3 +0,0 @@ -fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse' -chmod a+x ./clickhouse -su -m root -c './clickhouse install' diff --git a/docs/_includes/install/mac-arm.sh b/docs/_includes/install/mac-arm.sh deleted file mode 100644 index 5863d5b090d..00000000000 --- a/docs/_includes/install/mac-arm.sh +++ /dev/null @@ -1,3 +0,0 @@ -wget 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' -chmod a+x ./clickhouse -./clickhouse diff --git a/docs/_includes/install/mac-x86.sh b/docs/_includes/install/mac-x86.sh deleted file mode 100644 index 2a216b534b4..00000000000 --- a/docs/_includes/install/mac-x86.sh +++ /dev/null @@ -1,3 +0,0 @@ -wget 'https://builds.clickhouse.com/master/macos/clickhouse' -chmod a+x ./clickhouse -./clickhouse From d7002f8b94193f4a9c0dd1fcb154c55d63151d7f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 30 May 2023 17:46:21 +0000 Subject: [PATCH 29/30] Incorporate Dan's feedback --- docs/en/development/build.md | 17 +---------------- docs/en/getting-started/install.md | 16 +++++++++------- .../aspell-ignore/en/aspell-dict.txt | 6 ++++++ 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/docs/en/development/build.md b/docs/en/development/build.md index c35e077a166..83a4550df88 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -43,7 +43,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test For other Linux distribution - check the availability of LLVM's [prebuild packages](https://releases.llvm.org/download.html). As of April 2023, any version of Clang >= 15 will work. -GCC as a compiler is not supported +GCC as a compiler is not supported. To build with a specific Clang version: :::tip @@ -114,18 +114,3 @@ mkdir build cmake -S . -B build cmake --build build ``` - -## You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse} - -ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour. - -The CI checks build the binaries on each commit to [ClickHouse](https://github.com/clickhouse/clickhouse/). To download them: - -1. Open the [commits list](https://github.com/ClickHouse/ClickHouse/commits/master) -1. 
Choose a **Merge pull request** commit that includes the new feature, or was added after the new feature -1. Click the status symbol (yellow dot, red x, green check) to open the CI check list -1. Scroll through the list until you find **ClickHouse build check x/x artifact groups are OK** -1. Click **Details** -1. Find the type of package for your operating system that you need and download the files. - -![build artifact check](images/find-build-artifact.png) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 74d712d9f5e..d44dc861888 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -46,7 +46,7 @@ On Linux, macOS and FreeBSD: The first time you run this script, the necessary files and folders are created in the current directory, then the server starts. -1. Open a new terminal and use the **clickhouse-client** to connect to your service: +1. Open a new terminal and use the **./clickhouse client** to connect to your service: ```bash ./clickhouse client @@ -352,9 +352,10 @@ On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sourc ### Install a CI-generated Binary -ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit, e.g. -[sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug) builds, cross-compiled builds etc. While such builds are -normally only useful during development, they can in certain situations also be interesting for users. +ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit in the [ClickHouse +repository](https://github.com/clickhouse/clickhouse/), e.g. [sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug) +builds, cross-compiled builds etc. While such builds are normally only useful during development, they can in certain situations also be +interesting for users. :::note Since ClickHouse's CI is evolving over time, the exact steps to download CI-generated builds may vary. @@ -365,14 +366,15 @@ For example, to download a aarch64 binary for ClickHouse v23.4, follow these ste - Find the GitHub pull request for release v23.4: [Release pull request for branch 23.4](https://github.com/ClickHouse/ClickHouse/pull/49238) - Click "Commits", then click a commit similar to "Update autogenerated version to 23.4.2.1 and contributors" for the particular version you like to install. -- Click the red cross to open the list of CI checks. +- Click the green check / yellow dot / red cross to open the list of CI checks. - Click "Details" next to "ClickHouse Build Check" in the list, it will open a page similar to [this page](https://s3.amazonaws.com/clickhouse-test-reports/46793/b460eb70bf29b19eadd19a1f959b15d186705394/clickhouse_build_check/report.html) - Find the rows with compiler = "clang-*-aarch64" - there are multiple rows. - Download the artifacts for these builds. To download binaries for very old x86-64 systems without [SSE3](https://en.wikipedia.org/wiki/SSE3) support or old ARM systems without -[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a GitHub pull request and find CI check "BuilderBinAmd64Compat", -respectively "BuilderBinAarch64V80Compat". 
Then click "Details", open the "Build" fold, scroll to the end, find message "Notice: Build URLs +[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a [pull +request](https://github.com/ClickHouse/ClickHouse/commits/master) and find CI check "BuilderBinAmd64Compat", respectively +"BuilderBinAarch64V80Compat". Then click "Details", open the "Build" fold, scroll to the end, find message "Notice: Build URLs https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the build. diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 65844a4946e..ded7a4643a9 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2,6 +2,7 @@ personal_ws-1.1 en 543 AArch ACLs AMQP +ARMv ASLR ASan Actian @@ -12,6 +13,8 @@ AvroConfluent BSON BSONEachRow Bool +BuilderBinAarch +BuilderBinAmd CCTOOLS CLion CMake @@ -27,6 +30,7 @@ ClickHouse's ClickableSquare CodeBlock CodeLLDB +Compat Config ConnectionDetails Contrib @@ -213,6 +217,7 @@ anonymized ansi async atomicity +autogenerated autogeneration autostart avro @@ -539,6 +544,7 @@ uint unary unencrypted unixodbc +unoptimized url userspace userver From a01e8644c5cad786cc9b1a34a2e1e1faf93d9399 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 30 May 2023 21:09:56 -0400 Subject: [PATCH 30/30] Settings update (#50350) * sync system.server_settings with doc * typos * update DNS cache setting * add server settings from system tbl * catch up * catch up * Trigger Build --------- Co-authored-by: Nikita Mikhaylov --- .../settings.md | 1061 ++++++++++------- 1 file changed, 631 insertions(+), 430 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 7ebf387f2ed..82be1c10dcc 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -7,6 +7,600 @@ description: This section contains descriptions of server settings that cannot b # Server Settings +## allow_use_jemalloc_memory + +Allows to use jemalloc memory. + +Type: Bool + +Default: 1 + + +## asynchronous_heavy_metrics_update_period_s + +Period in seconds for updating asynchronous metrics. + +Type: UInt32 + +Default: 120 + + +## asynchronous_metrics_update_period_s + +Period in seconds for updating asynchronous metrics. + +Type: UInt32 + +Default: 1 + + +## background_buffer_flush_schedule_pool_size + +The maximum number of threads that will be used for performing flush operations for Buffer-engine tables in the background. + +Type: UInt64 + +Default: 16 + + +## background_common_pool_size + +The maximum number of threads that will be used for performing a variety of operations (mostly garbage collection) for \*MergeTree-engine tables in a background. + +Type: UInt64 + +Default: 8 + +## background_distributed_schedule_pool_size + +The maximum number of threads that will be used for executing distributed sends. + +Type: UInt64 + +Default: 16 + + +## background_fetches_pool_size + +The maximum number of threads that will be used for fetching data parts from another replica for \*MergeTree-engine tables in a background. 
+
+Type: UInt64
+
+Default: 8
+
+## background_merges_mutations_concurrency_ratio
+
+Sets a ratio between the number of threads and the number of background merges and mutations that can be executed concurrently. For example, if the ratio equals 2 and
+`background_pool_size` is set to 16, then ClickHouse can execute 32 background merges concurrently. This is possible because background operations can be suspended and postponed. This is needed to give small merges more execution priority. You can only increase this ratio at runtime. To lower it you have to restart the server.
+As with the `background_pool_size` setting, `background_merges_mutations_concurrency_ratio` could be applied from the `default` profile for backward compatibility.
+
+Type: Float
+
+Default: 2
+
+## background_merges_mutations_scheduling_policy
+
+The policy used to select the next merge or mutation to be executed by the background thread pool. The policy may be changed at runtime without a server restart.
+Could be applied from the `default` profile for backward compatibility.
+
+Possible values:
+
+- "round_robin" — Every concurrent merge and mutation is executed in round-robin order to ensure starvation-free operation. Smaller merges are completed faster than bigger ones just because they have fewer blocks to merge.
+- "shortest_task_first" — Always execute smaller merge or mutation. Merges and mutations are assigned priorities based on their resulting size. Merges with smaller sizes are strictly preferred over bigger ones. This policy ensures the fastest possible merge of small parts but can lead to indefinite starvation of big merges in partitions heavily overloaded by INSERTs.
+
+Type: String
+
+Default: round_robin
+
+## background_message_broker_schedule_pool_size
+
+The maximum number of threads that will be used for executing background operations for message streaming.
+
+Type: UInt64
+
+Default: 16
+
+## background_move_pool_size
+
+The maximum number of threads that will be used for moving data parts to another disk or volume for *MergeTree-engine tables in the background.
+
+Type: UInt64
+
+Default: 8
+
+## background_pool_size
+
+Sets the number of threads performing background merges and mutations for tables with MergeTree engines. You can only increase the number of threads at runtime. To lower the number of threads you have to restart the server. By adjusting this setting, you manage CPU and disk load. Smaller pool size utilizes less CPU and disk resources, but background processes advance slower which might eventually impact query performance.
+
+Before changing it, please also take a look at related MergeTree settings, such as `number_of_free_entries_in_pool_to_lower_max_size_of_merge` and `number_of_free_entries_in_pool_to_execute_mutation`.
+
+Type: UInt64
+
+Default: 16
+
+## background_schedule_pool_size
+
+The maximum number of threads that will be used for constantly executing some lightweight periodic operations for replicated tables, Kafka streaming, and DNS cache updates.
+
+Type: UInt64
+
+Default: 128
+
+## backup_threads
+
+The maximum number of threads to execute BACKUP requests.
+
+Type: UInt64
+
+Default: 16
+
+
+## backups_io_thread_pool_queue_size
+
+The maximum number of jobs that can be scheduled on the Backups IO Thread pool.
It is recommended to keep this queue unlimited (0) due to the current S3 backup logic. + +Type: UInt64 + +Default: 0 + +## cache_size_to_ram_max_ratio + +Set cache size to RAM max ratio. Allows lowering the cache size on low-memory systems. + +Type: Double + +Default: 0.5 + +## concurrent_threads_soft_limit_num + +The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get at least one thread to run. Query can upscale to desired number of threads during execution if more threads become available. + +Zero means Unlimited. + +Type: UInt64 + +Default: 0 + +## concurrent_threads_soft_limit_ratio_to_cores + +Same as concurrent_threads_soft_limit_num, but with ratio to cores. + +Type: UInt64 + +Default: 0 + + +## default_database + +Default database name. + +Type: String + +Default: default + +## disable_internal_dns_cache + +Disables the internal DNS cache. Recommended for operating ClickHouse in systems +with frequently changing infrastructure such as Kubernetes. + +Type: Bool + +Default: 0 + +## dns_cache_update_period + +Internal DNS cache update period in seconds. + +Type: Int32 + +Default: 15 + + +## dns_max_consecutive_failures + +Max connection failures before dropping host from ClickHouse DNS cache + +Type: UInt32 + +Default: 1024 + + +## index_mark_cache_size + +Size of cache for index marks. Zero means disabled. + +Type: UInt64 + +Default: 0 + + +## index_uncompressed_cache_size + +Size of cache for uncompressed blocks of MergeTree indices. Zero means disabled. + +Type: UInt64 + +Default: 0 + + +## io_thread_pool_queue_size + +Queue size for IO thread pool. Zero means unlimited. + +Type: UInt64 + +Default: 10000 + +## mark_cache_policy + +Mark cache policy name. + +Type: String + +Default: SLRU + +## mark_cache_size + +Size of cache for marks (index of MergeTree family of tables). + +Type: UInt64 + +Default: 5368709120 + +## max_backup_bandwidth_for_server + +The maximum read speed in bytes per second for all backups on server. Zero means unlimited. + +Type: UInt64 + +Default: 0 + +## max_backups_io_thread_pool_free_size + +If the number of **idle** threads in the Backups IO Thread pool exceeds `max_backup_io_thread_pool_free_size`, ClickHouse will release resources occupied by idling threads and decrease the pool size. Threads can be created again if necessary. + +Type: UInt64 + +Default: 0 + +## max_backups_io_thread_pool_size + +The maximum number of threads that would be used for IO operations for BACKUP queries + +Type: UInt64 + +Default: 1000 + +## max_concurrent_insert_queries + +Limit on total number of concurrent insert queries. Zero means Unlimited. + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Type: UInt64 + +Default: 0 + +## max_concurrent_queries + +Limit on total number of concurrently executed queries. Zero means Unlimited. Note that limits on insert and select queries, and on the maximum number of queries for users must also be considered. See also max_concurrent_insert_queries, max_concurrent_select_queries, max_concurrent_queries_for_all_users. Zero means unlimited. + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. 
+::: + +Type: UInt64 + +Default: 0 + +## max_concurrent_select_queries + +Limit on total number of concurrently select queries. Zero means Unlimited. + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Type: UInt64 + +Default: 0 + +## max_connections + +Max server connections. + +Type: Int32 + +Default: 1024 + +## max_io_thread_pool_free_size + +Max free size for IO thread pool. + +Type: UInt64 + +Default: 0 + +## max_io_thread_pool_size + +The maximum number of threads that would be used for IO operations + +Type: UInt64 + +Default: 100 + +## max_local_read_bandwidth_for_server + +The maximum speed of local reads in bytes per second. Zero means unlimited. + +Type: UInt64 + +Default: 0 + +## max_local_write_bandwidth_for_server + +The maximum speed of local writes in bytes per second. Zero means unlimited. + +Type: UInt64 + +Default: 0 + +## max_partition_size_to_drop + +Restriction on dropping partitions. + +If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_partition_size_to_drop` (in bytes), you can’t drop a partition using a [DROP PARTITION](../../sql-reference/statements/alter/partition.md#drop-partitionpart) query. +This setting does not require a restart of the Clickhouse server to apply. Another way to disable the restriction is to create the `/flags/force_drop_table` file. +Default value: 50 GB. +The value 0 means that you can drop partitions without any restrictions. + +:::note +This limitation does not restrict drop table and truncate table, see [max_table_size_to_drop](#max-table-size-to-drop) +::: + +## max_remote_read_network_bandwidth_for_server + +The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited. + +Type: UInt64 + +Default: 0 + +## max_remote_write_network_bandwidth_for_server + +The maximum speed of data exchange over the network in bytes per second for write. Zero means unlimited. + +Type: UInt64 + +Default: 0 + +## max_server_memory_usage + +Limit on total memory usage. Zero means Unlimited. + +The default `max_server_memory_usage` value is calculated as `memory_amount * max_server_memory_usage_to_ram_ratio`. + +Type: UInt64 + +Default: 0 + +## max_server_memory_usage_to_ram_ratio + +Same as max_server_memory_usage but in a ratio to physical RAM. Allows lowering the memory usage on low-memory systems. Zero means unlimited. + +On hosts with low RAM and swap, you possibly need setting `max_server_memory_usage_to_ram_ratio` larger than 1. + +Type: Double + +Default: 0.9 + +## max_table_size_to_drop + +Restriction on deleting tables. + +If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can’t delete it using a [DROP](../../sql-reference/statements/drop.md) query or [TRUNCATE](../../sql-reference/statements/truncate.md) query. + +This setting does not require a restart of the Clickhouse server to apply. Another way to disable the restriction is to create the `/flags/force_drop_table` file. + +Default value: 50 GB. +The value 0 means that you can delete all tables without any restrictions. +**Example** +``` xml +0 +``` + +## max_temporary_data_on_disk_size + +The maximum amount of storage that could be used for external aggregation, joins or sorting. +Queries that exceed this limit will fail with an exception. Zero means unlimited. 
+ +Also see `max_temporary_data_on_disk_size_for_user` and `max_temporary_data_on_disk_size_for_query`. + +Type: UInt64 + +Default: 0 + +## max_thread_pool_free_size + +If the number of **idle** threads in the Global Thread pool is greater than `max_thread_pool_free_size`, then ClickHouse releases resources occupied by some threads and the pool size is decreased. Threads can be created again if necessary. + +Type: UInt64 + +Default: 1000 + +## max_thread_pool_size + +The maximum number of threads that could be allocated from the OS and used for query execution and background operations. + +Type: UInt64 + +Default: 10000 + +## mmap_cache_size + +Sets the cache size (in bytes) for mapped files. This setting allows avoiding frequent open/close calls (which are very expensive due to consequent page faults), and to reuse mappings from several threads and queries. The setting value is the number of mapped regions (usually equal to the number of mapped files). The amount of data in mapped files can be monitored in the tables system.metrics and system.metric_log with the `MMappedFiles` and `MMappedFileBytes` metrics. Also, in system.asynchronous_metrics and system.asynchronous_metrics_log by the `MMapCacheCells` metric, and in system.events, system.processes, system.query_log, system.query_thread_log, system.query_views_log by the `CreatedReadBufferMMap`, `CreatedReadBufferMMapFailed`, `MMappedFileCacheHits`, `MMappedFileCacheMisses` events. + +Note that the amount of data in mapped files does not consume memory directly and is not accounted for in query or server memory usage — because this memory can be discarded similar to the OS page cache. The cache is dropped (the files are closed) automatically on the removal of old parts in tables of the MergeTree family, also it can be dropped manually by the `SYSTEM DROP MMAP CACHE` query. + +Type: UInt64 + +Default: 1000 + +## restore_threads + +The maximum number of threads to execute RESTORE requests. + +Type: UInt64 + +Default: 16 + +## show_addresses_in_stack_traces + +If it is set true will show addresses in stack traces + +Type: Bool + +Default: 1 + +## shutdown_wait_unfinished_queries + +If set true ClickHouse will wait for running queries finish before shutdown. + +Type: Bool + +Default: 0 + +## temporary_data_in_cache + +With this option, temporary data will be stored in the cache for the particular disk. +In this section, you should specify the disk name with the type `cache`. +In that case, the cache and temporary data will share the same space, and the disk cache can be evicted to create temporary data. + +:::note +Only one option can be used to configure temporary data storage: `tmp_path` ,`tmp_policy`, `temporary_data_in_cache`. +::: + +**Example** + +Both the cache for `local_disk`, and temporary data will be stored in `/tiny_local_cache` on the filesystem, managed by `tiny_local_cache`. + +```xml + + + + + local + /local_disk/ + + + + + cache + local_disk + /tiny_local_cache/ + 10M + 1M + 1 + 0 + + + + + + + tiny_local_cache + + +``` + +Type: String + +Default: + +## thread_pool_queue_size + +TThe maximum number of jobs that can be scheduled on the Global Thread pool. Increasing queue size leads to larger memory usage. It is recommended to keep this value equal to `max_thread_pool_size`. Zero means unlimited. + +Type: UInt64 + +Default: 10000 + +## tmp_policy + +Policy for storage with temporary data. Also see the MergeTree Table Engine documentation. 
+
+:::note
+- Only one option can be used to configure temporary data storage: `tmp_path`, `tmp_policy`, `temporary_data_in_cache`.
+- `move_factor`, `keep_free_space_bytes` and `max_data_part_size_bytes` are ignored.
+- Policy should have exactly *one volume* with *local* disks.
+:::
+
+**Example**
+
+When `/disk1` is full, temporary data will be stored on `/disk2`.
+
+```xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <disk1>
+                <path>/disk1/</path>
+            </disk1>
+            <disk2>
+                <path>/disk2/</path>
+            </disk2>
+        </disks>
+
+        <policies>
+            <tmp_two_disks>
+                <volumes>
+                    <main>
+                        <disk>disk1</disk>
+                        <disk>disk2</disk>
+                    </main>
+                </volumes>
+            </tmp_two_disks>
+        </policies>
+    </storage_configuration>
+
+    <tmp_policy>tmp_two_disks</tmp_policy>
+</clickhouse>
+``` +Type: String + +Default: + +## uncompressed_cache_policy + +Uncompressed cache policy name. + +Type: String + +Default: SLRU + + +## uncompressed_cache_size + +Cache size (in bytes) for uncompressed data used by table engines from the MergeTree family. Zero means disabled. + +There is one shared cache for the server. Memory is allocated on demand. The cache is used if the option use_uncompressed_cache is enabled. + +The uncompressed cache is advantageous for very short queries in individual cases. + +Type: UInt64 + +Default: 0 + ## builtin_dictionaries_reload_interval {#builtin-dictionaries-reload-interval} The interval in seconds before reloading built-in dictionaries. @@ -224,18 +818,6 @@ Sets scheduling period of the task. Zero means "never". Default value: `86400` (1 day). -## default_database {#default-database} - -The default database. - -To get a list of databases, use the [SHOW DATABASES](../../sql-reference/statements/show.md#show-databases) query. - -**Example** - -``` xml -default -``` - ## default_profile {#default-profile} Default settings profile. @@ -695,157 +1277,6 @@ For more information, see the section [Creating replicated tables](../../engines ``` -## mark_cache_size {#server-mark-cache-size} - -Approximate size (in bytes) of the cache of marks used by table engines of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family. - -The cache is shared for the server and memory is allocated as needed. - -**Example** - -``` xml -5368709120 -``` - -## max_server_memory_usage {#max_server_memory_usage} - -Limits total RAM usage by the ClickHouse server. - -Possible values: - -- Positive integer. -- 0 — Auto. - -Default value: `0`. - -**Additional Info** - -The default `max_server_memory_usage` value is calculated as `memory_amount * max_server_memory_usage_to_ram_ratio`. - -**See also** - -- [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) -- [max_server_memory_usage_to_ram_ratio](#max_server_memory_usage_to_ram_ratio) - -## max_server_memory_usage_to_ram_ratio {#max_server_memory_usage_to_ram_ratio} - -Defines the fraction of total physical RAM amount, available to the ClickHouse server. If the server tries to utilize more, the memory is cut down to the appropriate amount. - -Possible values: - -- Positive double. -- 0 — The ClickHouse server can use all available RAM. - -Default value: `0.9`. - -**Usage** - -On hosts with low RAM and swap, you possibly need setting `max_server_memory_usage_to_ram_ratio` larger than 1. - -**Example** - -``` xml -0.9 -``` - -**See Also** - -- [max_server_memory_usage](#max_server_memory_usage) - -## concurrent_threads_soft_limit_num {#concurrent_threads_soft_limit_num} -The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get at least one thread to run. Query can upscale to desired number of threads during execution if more threads become available. - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**See Also** - -- [Concurrency Control](/docs/en/development/architecture.md#concurrency-control) - -## concurrent_threads_soft_limit_ratio_to_cores {#concurrent_threads_soft_limit_ratio_to_cores} -The maximum number of query processing threads as multiple of number of logical cores. -More details: [concurrent_threads_soft_limit_num](#concurrent_threads_soft_limit_num). 
- -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -3 -``` - -## max_concurrent_queries {#max-concurrent-queries} - -The maximum number of simultaneously processed queries. -Note that other limits also apply: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users). - -:::note -These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. -::: - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `100`. - -**Example** - -``` xml -200 -``` - -## max_concurrent_insert_queries {#max-concurrent-insert-queries} - -The maximum number of simultaneously processed `INSERT` queries. - -:::note -These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. -::: - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -100 -``` - -## max_concurrent_select_queries {#max-concurrent-select-queries} - -The maximum number of simultaneously processed `SELECT` queries. - -:::note -These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. -::: - -Possible values: - -- Positive integer. -- 0 — No limit. - -Default value: `0`. - -**Example** - -``` xml -100 -``` ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} @@ -889,16 +1320,6 @@ Default value: `0`. - [max_concurrent_queries](#max-concurrent-queries) -## max_connections {#max-connections} - -The maximum number of inbound connections. - -**Example** - -``` xml -4096 -``` - ## max_open_files {#max-open-files} The maximum number of open files. @@ -1128,91 +1549,8 @@ Default value: `0.5`. - [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) - [merges_mutations_memory_usage_soft_limit](#merges_mutations_memory_usage_soft_limit) -## background_merges_mutations_scheduling_policy {#background_merges_mutations_scheduling_policy} -Algorithm used to select next merge or mutation to be executed by background thread pool. Policy may be changed at runtime without server restart. -Could be applied from the `default` profile for backward compatibility. -Possible values: - -- "round_robin" — Every concurrent merge and mutation is executed in round-robin order to ensure starvation-free operation. Smaller merges are completed faster than bigger ones just because they have fewer blocks to merge. -- "shortest_task_first" — Always execute smaller merge or mutation. Merges and mutations are assigned priorities based on their resulting size. Merges with smaller sizes are strictly preferred over bigger ones. This policy ensures the fastest possible merge of small parts but can lead to indefinite starvation of big merges in partitions heavily overloaded by INSERTs. - -Default value: "round_robin". - -**Example** - -```xml -shortest_task_first -``` - -## background_move_pool_size {#background_move_pool_size} - -Sets the number of threads performing background moves for tables with MergeTree engines. Could be increased at runtime and could be applied at server startup from the `default` profile for backward compatibility. - -Possible values: - -- Any positive integer. 
- -Default value: 8. - -**Example** - -```xml -36 -``` - -## background_fetches_pool_size {#background_fetches_pool_size} - -Sets the number of threads performing background fetches for tables with ReplicatedMergeTree engines. Could be increased at runtime. - -Possible values: - -- Any positive integer. - -Default value: 8. - -**Example** - -```xml -36 -``` - -## background_common_pool_size {#background_common_pool_size} - -Sets the number of threads performing background non-specialized operations like cleaning the filesystem etc. for tables with MergeTree engines. Could be increased at runtime. - -Possible values: - -- Any positive integer. - -Default value: 8. - -**Example** - -```xml -36 -``` - -## background_buffer_flush_schedule_pool_size {#background_buffer_flush_schedule_pool_size} - -Sets the number of threads performing background flush in [Buffer](../../engines/table-engines/special/buffer.md)-engine tables. - -Possible values: - -- Any positive integer. - -Default value: 16. - -## background_schedule_pool_size {#background_schedule_pool_size} - -Sets the number of threads performing background tasks for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables, [Kafka](../../engines/table-engines/integrations/kafka.md) streaming, [DNS cache updates](../../operations/server-configuration-parameters/settings.md/#server-settings-dns-cache-update-period). - -Possible values: - -- Any positive integer. - -Default value: 128. ## merge_tree {#server_configuration_parameters-merge_tree} @@ -1341,14 +1679,14 @@ Queries are logged in the [system.part_log](../../operations/system-tables/part_ Use the following parameters to configure logging: -- `database` - Name of the database. -- `table` - Name of the system table. -- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `database` – Name of the database. +- `table` – Name of the system table. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). -- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). **Example** @@ -1419,14 +1757,14 @@ Queries are logged in the [system.query_log](../../operations/system-tables/quer Use the following parameters to configure logging: -- `database` - Name of the database. -- `table` - Name of the system table the queries will be logged in. 
-- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). -- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). If the table does not exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. @@ -1477,14 +1815,14 @@ Queries are logged in the [system.query_thread_log](../../operations/system-tabl Use the following parameters to configure logging: -- `database` - Name of the database. -- `table` - Name of the system table the queries will be logged in. -- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). -- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). 
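
For orientation, a minimal `query_thread_log` block wired up with these parameters might look like the sketch below (the values are illustrative rather than authoritative defaults):

```xml
<query_thread_log>
    <database>system</database>
    <table>query_thread_log</table>
    <partition_by>toYYYYMM(event_date)</partition_by>
    <flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_thread_log>
```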
If the table does not exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. @@ -1507,14 +1845,14 @@ Queries are logged in the [system.query_views_log](../../operations/system-table Use the following parameters to configure logging: -- `database` - Name of the database. -- `table` - Name of the system table the queries will be logged in. -- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). -- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). If the table does not exist, ClickHouse will create it. If the structure of the query views log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. @@ -1535,15 +1873,15 @@ Settings for the [text_log](../../operations/system-tables/text_log.md#system_ta Parameters: -- `level` - Maximum Message Level (by default `Trace`) which will be stored in a table. -- `database` - Database name. -- `table` - Table name. -- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `level` — Maximum Message Level (by default `Trace`) which will be stored in a table. +- `database` — Database name. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). 
-- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). **Example** ```xml @@ -1566,14 +1904,14 @@ Settings for the [trace_log](../../operations/system-tables/trace_log.md#system_ Parameters: -- `database` - Database for storing a table. -- `table` - Table name. -- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `database` — Database for storing a table. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. - `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined. - `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` or `order_by` defined. -- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table. -- `storage_policy` - Name of storage policy to use for the table (optional). -- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional). +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. +- `storage_policy` – Name of storage policy to use for the table (optional) +- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree.md/#settings) that control the behavior of the MergeTree (optional). The default server configuration file `config.xml` contains the following settings section: @@ -1714,115 +2052,6 @@ Path on the local filesystem to store temporary data for processing large querie /var/lib/clickhouse/tmp/ ``` -## tmp_policy {#tmp-policy} - -Alternatively, a policy from [storage_configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) can be used to store temporary files. - -:::note -- Only one option can be used to configure temporary data storage: `tmp_path` ,`tmp_policy`, `temporary_data_in_cache`. -- `move_factor`, `keep_free_space_bytes`,`max_data_part_size_bytes` and are ignored. -- Policy should have exactly *one volume* with *local* disks. -::: - -**Example** - -```xml - - - - /disk1/ - - - /disk2/ - - - - - - -
-                        <disk>disk1</disk>
-                        <disk>disk2</disk>
-                    </volume>
-                </volumes>
-            </tmp_two_disks>
-        </policies>
-    </storage_configuration>
-
-    <tmp_policy>tmp_two_disks</tmp_policy>
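-    <!-- tmp_policy references the storage policy defined above; temporary data is written to disk1 and spills over to disk2 once disk1 is full. -->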
- -``` - -When `/disk1` is full, temporary data will be stored on `/disk2`. - -## temporary_data_in_cache {#temporary-data-in-cache} - -With this option, temporary data will be stored in the cache for the particular disk. -In this section, you should specify the disk name with the type `cache`. -In that case, the cache and temporary data will share the same space, and the disk cache can be evicted to create temporary data. - -:::note -- Only one option can be used to configure temporary data storage: `tmp_path` ,`tmp_policy`, `temporary_data_in_cache`. -::: - -**Example** - -```xml - - - - - local - /local_disk/ - - - - cache - local_disk - /tiny_local_cache/ - 10M - 1M - 1 - 0 - - - - - tiny_local_cache - -``` - -Cache for `local_disk` and temporary data will be stored in `/tiny_local_cache` on the filesystem, managed by `tiny_local_cache`. - -## max_temporary_data_on_disk_size {#max_temporary_data_on_disk_size} - -Limit the amount of disk space consumed by temporary files in `tmp_path` for the server. -Queries that exceed this limit will fail with an exception. - -Default value: `0`. - -**See also** - -- [max_temporary_data_on_disk_size_for_user](../../operations/settings/query-complexity.md#settings_max_temporary_data_on_disk_size_for_user) -- [max_temporary_data_on_disk_size_for_query](../../operations/settings/query-complexity.md#settings_max_temporary_data_on_disk_size_for_query) -- [tmp_path](#tmp-path) -- [tmp_policy](#tmp-policy) -- [max_server_memory_usage](#max_server_memory_usage) - -## uncompressed_cache_size {#server-settings-uncompressed_cache_size} - -Cache size (in bytes) for uncompressed data used by table engines from the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). - -There is one shared cache for the server. Memory is allocated on demand. The cache is used if the option [use_uncompressed_cache](../../operations/settings/settings.md#setting-use_uncompressed_cache) is enabled. - -The uncompressed cache is advantageous for very short queries in individual cases. - -**Example** - -``` xml -8589934592 -``` ## user_files_path {#server_configuration_parameters-user_files_path} @@ -1953,24 +2182,6 @@ Data part headers already stored with this setting can't be restored to their pr **Default value:** 0. -## disable_internal_dns_cache {#server-settings-disable-internal-dns-cache} - -Disables the internal DNS cache. Recommended for operating ClickHouse in systems -with frequently changing infrastructure such as Kubernetes. - -**Default value:** 0. - -## dns_cache_update_period {#server-settings-dns-cache-update-period} - -The period of updating IP addresses stored in the ClickHouse internal DNS cache (in seconds). -The update is performed asynchronously, in a separate system thread. - -**Default value**: 15. - -**See also** - -- [background_schedule_pool_size](../../operations/settings/settings.md#background_schedule_pool_size) - ## distributed_ddl {#server-settings-distributed_ddl} Manage executing [distributed ddl queries](../../sql-reference/distributed-ddl.md) (CREATE, DROP, ALTER, RENAME) on cluster. @@ -2100,16 +2311,6 @@ Possible values: Default value: `0`. -## mmap_cache_size {#mmap-cache-size} - -Sets the cache size (in bytes) for mapped files. This setting allows to avoid frequent open/[mmap/munmap](https://en.wikipedia.org/wiki/Mmap)/close calls (which are very expensive due to consequent page faults) and to reuse mappings from several threads and queries. 
The setting value is the number of mapped regions (usually equal to the number of mapped files). The amount of data in mapped files can be monitored in [system.metrics](../../operations/system-tables/metrics.md), [system.metric_log](../../operations/system-tables/metric_log.md) system tables by the `MMappedFiles` and `MMappedFileBytes` metrics, in [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md), [system.asynchronous_metrics_log](../../operations/system-tables/asynchronous_metric_log.md) by the `MMapCacheCells` metric, and also in [system.events](../../operations/system-tables/events.md), [system.processes](../../operations/system-tables/processes.md), [system.query_log](../../operations/system-tables/query_log.md), [system.query_thread_log](../../operations/system-tables/query_thread_log.md), [system.query_views_log](../../operations/system-tables/query_views_log.md) by the `CreatedReadBufferMMap`, `CreatedReadBufferMMapFailed`, `MMappedFileCacheHits`, `MMappedFileCacheMisses` events. Note that the amount of data in mapped files does not consume memory directly and is not accounted in query or server memory usage — because this memory can be discarded similar to OS page cache. The cache is dropped (the files are closed) automatically on the removal of old parts in tables of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family, also it can be dropped manually by the `SYSTEM DROP MMAP CACHE` query. - -Possible values: - -- Positive integer. - -Default value: `1000`. - ## compiled_expression_cache_size {#compiled-expression-cache-size} Sets the cache size (in bytes) for [compiled expressions](../../operations/caches.md).
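
A minimal sketch of how this could be set in `config.xml` (the value shown is only illustrative, not a recommendation):

```xml
<compiled_expression_cache_size>134217728</compiled_expression_cache_size>
```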