From 2c40dd6a4c6aaa956762b8709fe5e4686e16876c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 2 May 2023 14:36:02 +0200 Subject: [PATCH 1/9] Switch Block::NameMap to google::dense_hash_map over HashMap Since HashMap creates 2^8 elements by default, while dense_hash_map should be good here. Signed-off-by: Azat Khuzhin --- src/Access/AuthenticationData.cpp | 1 + src/AggregateFunctions/AggregateFunctionForEach.h | 1 + src/AggregateFunctions/AggregateFunctionMinMaxAny.h | 1 + src/CMakeLists.txt | 3 ++- src/Core/Block.cpp | 6 ++---- src/Core/Block.h | 6 ++---- .../SerializationAggregateFunction.cpp | 1 + src/Formats/ColumnMapping.cpp | 6 +++--- src/Functions/FunctionBinaryArithmetic.h | 1 + src/Interpreters/QueryLog.h | 1 + src/Interpreters/QueryViewsLog.h | 1 + src/Interpreters/tests/gtest_context_race.cpp | 1 + .../Formats/Impl/BSONEachRowRowInputFormat.cpp | 12 +++++++----- .../Formats/Impl/BSONEachRowRowInputFormat.h | 2 +- .../Impl/JSONColumnsBlockInputFormatBase.cpp | 2 +- .../Formats/Impl/JSONEachRowRowInputFormat.cpp | 13 ++++++------- .../Formats/Impl/JSONEachRowRowInputFormat.h | 5 ++--- .../QueryPlan/DistributedCreateLocalPlan.h | 1 + src/Storages/DataLakes/DeltaLakeMetadataParser.cpp | 5 +++-- src/Storages/DataLakes/HudiMetadataParser.cpp | 1 + src/Storages/DataLakes/IcebergMetadataParser.cpp | 1 + src/Storages/StorageS3.h | 1 + 22 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/Access/AuthenticationData.cpp b/src/Access/AuthenticationData.cpp index 409338209cc..3bb0be160f4 100644 --- a/src/Access/AuthenticationData.cpp +++ b/src/Access/AuthenticationData.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionForEach.h b/src/AggregateFunctions/AggregateFunctionForEach.h index 81ba298bb8a..480b4cc690e 100644 --- a/src/AggregateFunctions/AggregateFunctionForEach.h +++ b/src/AggregateFunctions/AggregateFunctionForEach.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index c2cf2ac418f..94c0d60be81 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3f4fbb7420..c04b7acad3d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -211,6 +211,7 @@ endif() if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc) endif() +target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash) add_subdirectory(Access/Common) add_subdirectory(Common/ZooKeeper) @@ -463,7 +464,7 @@ endif () if (TARGET ch_contrib::ldap) dbms_target_link_libraries (PRIVATE ch_contrib::ldap ch_contrib::lber) endif () -dbms_target_link_libraries (PRIVATE ch_contrib::sparsehash) +dbms_target_link_libraries (PUBLIC ch_contrib::sparsehash) if (TARGET ch_contrib::protobuf) dbms_target_link_libraries (PRIVATE ch_contrib::protobuf) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 456f1d5d95e..a3bd29faab1 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -663,12 +663,10 @@ Names Block::getDataTypeNames() const Block::NameMap Block::getNamesToIndexesMap() const { - NameMap res; - res.reserve(index_by_name.size()); - + NameMap res(index_by_name.size()); + res.set_empty_key(StringRef{}); for (const auto & [name, index] : index_by_name) res[name] = index; - return res; } diff --git a/src/Core/Block.h b/src/Core/Block.h index eb9d57ea6f8..7eed48d3d9f 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -5,13 +5,11 @@ #include #include -#include - #include #include -#include #include #include +#include namespace DB @@ -97,7 +95,7 @@ public: Names getDataTypeNames() const; /// Hash table match `column name -> position in the block`. - using NameMap = HashMap; + using NameMap = ::google::dense_hash_map; NameMap getNamesToIndexesMap() const; Serializations getSerializations() const; diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index 7e192595114..c482c9623e9 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/src/Formats/ColumnMapping.cpp b/src/Formats/ColumnMapping.cpp index 2fab5766591..e33dfc878f4 100644 --- a/src/Formats/ColumnMapping.cpp +++ b/src/Formats/ColumnMapping.cpp @@ -26,8 +26,8 @@ void ColumnMapping::addColumns( { names_of_columns.push_back(name); - const auto * column_it = column_indexes_by_names.find(name); - if (!column_it) + const auto column_it = column_indexes_by_names.find(name); + if (column_it == column_indexes_by_names.end()) { if (settings.skip_unknown_fields) { @@ -43,7 +43,7 @@ void ColumnMapping::addColumns( name, column_indexes_for_input_fields.size()); } - const auto column_index = column_it->getMapped(); + const auto column_index = column_it->second; if (read_columns[column_index]) throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing format header: {}", name); diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index a83bc6382b6..0174899d767 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/src/Interpreters/QueryLog.h b/src/Interpreters/QueryLog.h index a2d434e8843..44780f530e0 100644 --- a/src/Interpreters/QueryLog.h +++ b/src/Interpreters/QueryLog.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/src/Interpreters/QueryViewsLog.h b/src/Interpreters/QueryViewsLog.h index 986311fc822..e28bce0b91c 100644 --- a/src/Interpreters/QueryViewsLog.h +++ b/src/Interpreters/QueryViewsLog.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace ProfileEvents { diff --git a/src/Interpreters/tests/gtest_context_race.cpp b/src/Interpreters/tests/gtest_context_race.cpp index 60531494592..ec61fc9467c 100644 --- a/src/Interpreters/tests/gtest_context_race.cpp +++ b/src/Interpreters/tests/gtest_context_race.cpp @@ -1,6 +1,7 @@ #include #include #include +#include using namespace DB; diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp index 370cddd2dcb..57598fb507f 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp @@ -64,20 +64,22 @@ inline size_t BSONEachRowRowInputFormat::columnIndex(const StringRef & name, siz /// Optimization by caching the order of fields (which is almost always the same) /// and a quick check to match the next expected field, instead of searching the hash table. - if (prev_positions.size() > key_index && prev_positions[key_index] && name == prev_positions[key_index]->getKey()) + if (prev_positions.size() > key_index + && prev_positions[key_index] != Block::NameMap::const_iterator{} + && name == prev_positions[key_index]->first) { - return prev_positions[key_index]->getMapped(); + return prev_positions[key_index]->second; } else { - auto * it = name_map.find(name); + const auto it = name_map.find(name); - if (it) + if (it != name_map.end()) { if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->getMapped(); + return it->second; } else return UNKNOWN_FIELD; diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h index ad6d712b6dd..538a59e05c3 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h @@ -91,7 +91,7 @@ private: Block::NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. - std::vector prev_positions; + std::vector prev_positions; DataTypes types; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 3d052e23c21..2e264c59f56 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -128,7 +128,7 @@ Chunk JSONColumnsBlockInputFormatBase::generate() { /// Check if this name appears in header. If no, skip this column or throw /// an exception according to setting input_format_skip_unknown_fields - if (!name_to_index.has(*column_name)) + if (name_to_index.find(*column_name) == name_to_index.end()) { if (!format_settings.skip_unknown_fields) throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column found in input data: {}", *column_name); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 22ac31c7824..e5f52936021 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -71,21 +71,20 @@ inline size_t JSONEachRowRowInputFormat::columnIndex(StringRef name, size_t key_ /// and a quick check to match the next expected field, instead of searching the hash table. if (prev_positions.size() > key_index - && prev_positions[key_index] - && name == prev_positions[key_index]->getKey()) + && prev_positions[key_index] != Block::NameMap::const_iterator{} + && name == prev_positions[key_index]->first) { - return prev_positions[key_index]->getMapped(); + return prev_positions[key_index]->second; } else { - auto * it = name_map.find(name); - - if (it) + const auto it = name_map.find(name); + if (it != name_map.end()) { if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->getMapped(); + return it->second; } else return UNKNOWN_FIELD; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index beee9e95821..ce42071585e 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -71,11 +71,10 @@ private: /// for row like {..., "non-nullable column name" : null, ...} /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. - using NameMap = HashMap; - NameMap name_map; + Block::NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. - std::vector prev_positions; + std::vector prev_positions; bool allow_new_rows = true; diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h index 16bf1c565ff..1afdc07fa4d 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp index 45189ca325a..309aa54909a 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp @@ -11,10 +11,11 @@ #include #include #include -#include -#include #include #include +#include +#include +#include #include namespace fs = std::filesystem; diff --git a/src/Storages/DataLakes/HudiMetadataParser.cpp b/src/Storages/DataLakes/HudiMetadataParser.cpp index 74e556f2bf2..a1f35a5ae42 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.cpp +++ b/src/Storages/DataLakes/HudiMetadataParser.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "config.h" #include diff --git a/src/Storages/DataLakes/IcebergMetadataParser.cpp b/src/Storages/DataLakes/IcebergMetadataParser.cpp index 4e90da41721..3ab90e271cf 100644 --- a/src/Storages/DataLakes/IcebergMetadataParser.cpp +++ b/src/Storages/DataLakes/IcebergMetadataParser.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index c3b862f6bbd..12573ab513f 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include From fcce60ad25243f128d97a264cdec4c025e67a1b3 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Fri, 12 May 2023 15:09:34 +0200 Subject: [PATCH 2/9] Randomize enable_multiple_prewhere_read_steps --- tests/clickhouse-test | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e279b899a93..09bc15fbe3d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -530,6 +530,7 @@ class SettingsRandomizer: "max_threads": lambda: random.randint(1, 64), "optimize_or_like_chain": lambda: random.randint(0, 1), "optimize_read_in_order": lambda: random.randint(0, 1), + "enable_multiple_prewhere_read_steps": lambda: random.randint(0, 1), "read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100), "optimize_aggregation_in_order": lambda: random.randint(0, 1), "aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000), From 1e3b7af97a5845c0dd02299a40e65bad5e6468fb Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 12 May 2023 10:26:05 -0300 Subject: [PATCH 3/9] Add setting to limit the max number of pairs produced by extractKeyValuePairs --- src/Common/ErrorCodes.cpp | 1 + src/Core/Settings.h | 1 + .../keyvaluepair/extractKeyValuePairs.cpp | 13 ++++++-- .../impl/CHKeyValuePairExtractor.h | 15 +++++++--- .../impl/KeyValuePairExtractorBuilder.cpp | 14 ++++++--- .../impl/KeyValuePairExtractorBuilder.h | 3 ++ ...t_key_value_pairs_multiple_input.reference | 16 ++++++++++ ...extract_key_value_pairs_multiple_input.sql | 30 +++++++++++++++++++ 8 files changed, 82 insertions(+), 11 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index e9dc5649245..382b8ed8019 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -576,6 +576,7 @@ M(691, UNKNOWN_ELEMENT_OF_ENUM) \ M(692, TOO_MANY_MUTATIONS) \ M(693, AWS_ERROR) \ + M(694, TOO_LARGE_MAP_SIZE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f60632fae91..3d1640e21b1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -748,6 +748,7 @@ class IColumn; M(Bool, optimize_distinct_in_order, false, "This optimization has a bug and it is disabled. Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ M(Bool, allow_experimental_undrop_table_query, false, "Allow to use undrop query to restore dropped table in a limited time", 0) \ M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \ + M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index a1b140001ac..00588052870 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include #include #include @@ -41,6 +43,8 @@ class ExtractKeyValuePairs : public IFunction builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); } + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + return builder.build(); } @@ -73,7 +77,7 @@ class ExtractKeyValuePairs : public IFunction } public: - ExtractKeyValuePairs() = default; + explicit ExtractKeyValuePairs(ContextPtr context_) : context(context_) {} static constexpr auto name = Name::name; @@ -82,9 +86,9 @@ public: return name; } - static FunctionPtr create(ContextPtr) + static FunctionPtr create(ContextPtr context) { - return std::make_shared(); + return std::make_shared(context); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override @@ -120,6 +124,9 @@ public: { return {1, 2, 3, 4}; } + +private: + ContextPtr context; }; struct NameExtractKeyValuePairs diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 139417579de..24c8d3cd89a 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -13,6 +13,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int TOO_LARGE_MAP_SIZE; } /* @@ -25,8 +26,8 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor using NextState = DB::extractKV::StateHandler::NextState; public: - explicit CHKeyValuePairExtractor(StateHandler state_handler_) - : state_handler(std::move(state_handler_)) + explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_) + : state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_) {} uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override @@ -113,11 +114,16 @@ private: NextState flushPair(const std::string_view & file, auto & key, auto & value, uint64_t & row_offset) { + row_offset++; + + if (row_offset > max_number_of_pairs) + { + throw Exception(ErrorCodes::TOO_LARGE_MAP_SIZE, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); + } + key.commit(); value.commit(); - row_offset++; - return {0, file.empty() ? State::END : State::WAITING_KEY}; } @@ -128,6 +134,7 @@ private: } StateHandler state_handler; + uint64_t max_number_of_pairs; }; } diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp index 188285dc4bd..7f2a6449ab0 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp @@ -31,6 +31,12 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping() return *this; } +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_) +{ + max_number_of_pairs = max_number_of_pairs_; + return *this; +} + std::shared_ptr KeyValuePairExtractorBuilder::build() const { if (with_escaping) @@ -46,9 +52,9 @@ namespace using namespace extractKV; template -auto makeStateHandler(const T && handler) +auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs) { - return std::make_shared>(handler); + return std::make_shared>(handler, max_number_of_pairs); } } @@ -57,14 +63,14 @@ std::shared_ptr KeyValuePairExtractorBuilder::buildWithou { auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); - return makeStateHandler(NoEscapingStateHandler(configuration)); + return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs); } std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const { auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters); - return makeStateHandler(InlineEscapingStateHandler(configuration)); + return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs); } } diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h index 5746c58dccc..0c673f12ccf 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h @@ -20,6 +20,8 @@ public: KeyValuePairExtractorBuilder & withEscaping(); + KeyValuePairExtractorBuilder & withMaxNumberOfPairs(uint64_t max_number_of_pairs_); + std::shared_ptr build() const; private: @@ -27,6 +29,7 @@ private: char key_value_delimiter = ':'; char quoting_character = '"'; std::vector item_delimiters = {' ', ',', ';'}; + uint64_t max_number_of_pairs = std::numeric_limits::max(); std::shared_ptr buildWithEscaping() const; diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index 125afa19427..ec51b61a382 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -292,6 +292,22 @@ SELECT x; {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} -- { echoOn } + +SET extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} +SET extract_kvp_max_pairs_per_row = 99999; -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 0a1500b1796..7dfcae879b0 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -414,7 +414,37 @@ WITH SELECT x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +-- Should fail allowed because it exceeds the max number of pairs +SET extract_kvp_max_pairs_per_row = 1; +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; -- {serverError TOO_LARGE_MAP_SIZE} + -- { echoOn } + +SET extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; + +SET extract_kvp_max_pairs_per_row = 99999; + -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, From b1549a19a50048a93f5cc0b48c482160fcb66613 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 12 May 2023 11:19:35 -0300 Subject: [PATCH 4/9] Use 0 as unlimited --- .../keyvaluepair/extractKeyValuePairs.cpp | 7 ++++++- ...tract_key_value_pairs_multiple_input.reference | 15 ++++++++++++++- ...499_extract_key_value_pairs_multiple_input.sql | 14 +++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index 00588052870..9956eaf01f1 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -43,7 +43,12 @@ class ExtractKeyValuePairs : public IFunction builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); } - builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + bool is_number_of_pairs_unlimited = context->getSettingsRef().extract_kvp_max_pairs_per_row == 0; + + if (!is_number_of_pairs_unlimited) + { + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + } return builder.build(); } diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index ec51b61a382..a4ff1464fb8 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -307,7 +307,20 @@ WITH SELECT x; {'key1':'value1','key2':'value2'} -SET extract_kvp_max_pairs_per_row = 99999; +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 7dfcae879b0..8c5f0365cc7 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -443,7 +443,19 @@ WITH SELECT x; -SET extract_kvp_max_pairs_per_row = 99999; +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH From 2b793e3a14a850197ffbb23b76389b54f565586c Mon Sep 17 00:00:00 2001 From: darkkeks Date: Sat, 13 May 2023 16:22:17 +0300 Subject: [PATCH 5/9] [docs] Remove "example" section from date-time-functions page toc --- .../functions/date-time-functions.md | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 599b64ef43f..4710a5617c3 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1218,12 +1218,16 @@ Rounds the time to the half hour. Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 100 + MM). Accepts a second optional timezone argument. If provided, the timezone must be a string constant. -### example -```sql +**Example** + +``` sql SELECT toYYYYMM(now(), 'US/Eastern') ``` -```response + +Result: + +``` text ┌─toYYYYMM(now(), 'US/Eastern')─┐ │ 202303 │ └───────────────────────────────┘ @@ -1233,11 +1237,15 @@ SELECT Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 10000 + MM \* 100 + DD). Accepts a second optional timezone argument. If provided, the timezone must be a string constant. -### example +**Example** + ```sql SELECT toYYYYMMDD(now(), 'US/Eastern') ``` + +Result: + ```response ┌─toYYYYMMDD(now(), 'US/Eastern')─┐ │ 20230302 │ @@ -1248,11 +1256,15 @@ SELECT Converts a date or date with time to a UInt64 number containing the year and month number (YYYY \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss). Accepts a second optional timezone argument. If provided, the timezone must be a string constant. -### example +**Example** + ```sql SELECT toYYYYMMDDhhmmss(now(), 'US/Eastern') ``` + +Result: + ```response ┌─toYYYYMMDDhhmmss(now(), 'US/Eastern')─┐ │ 20230302112209 │ From 1db35384d969f9239de84563927494148100c6d5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 May 2023 03:30:03 +0200 Subject: [PATCH 6/9] Support `bitCount` for big integers --- src/Functions/bitCount.cpp | 9 ++++++++- .../02736_bit_count_big_int.reference | 13 +++++++++++++ .../0_stateless/02736_bit_count_big_int.sql | 19 +++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02736_bit_count_big_int.reference create mode 100644 tests/queries/0_stateless/02736_bit_count_big_int.sql diff --git a/src/Functions/bitCount.cpp b/src/Functions/bitCount.cpp index 984f33b7001..566a11481be 100644 --- a/src/Functions/bitCount.cpp +++ b/src/Functions/bitCount.cpp @@ -9,7 +9,7 @@ namespace DB template struct BitCountImpl { - using ResultType = UInt8; + using ResultType = std::conditional_t<(sizeof(A) * 8 >= 256), UInt16, UInt8>; static constexpr bool allow_string_or_fixed_string = true; static inline ResultType apply(A a) @@ -17,6 +17,13 @@ struct BitCountImpl /// We count bits in the value representation in memory. For example, we support floats. /// We need to avoid sign-extension when converting signed numbers to larger type. So, uint8_t(-1) has 8 bits. + if constexpr (is_big_int_v) + { + ResultType res = 0; + for (auto item : a.items) + res += __builtin_popcountll(item); + return res; + } if constexpr (std::is_same_v || std::is_same_v) return __builtin_popcountll(a); if constexpr (std::is_same_v || std::is_same_v || std::is_unsigned_v) diff --git a/tests/queries/0_stateless/02736_bit_count_big_int.reference b/tests/queries/0_stateless/02736_bit_count_big_int.reference new file mode 100644 index 00000000000..a3a725ace69 --- /dev/null +++ b/tests/queries/0_stateless/02736_bit_count_big_int.reference @@ -0,0 +1,13 @@ +128 +256 +128 +256 +127 +255 +126 +255 +64 +UInt8 +UInt16 +UInt8 +UInt16 diff --git a/tests/queries/0_stateless/02736_bit_count_big_int.sql b/tests/queries/0_stateless/02736_bit_count_big_int.sql new file mode 100644 index 00000000000..35a4a641606 --- /dev/null +++ b/tests/queries/0_stateless/02736_bit_count_big_int.sql @@ -0,0 +1,19 @@ +SELECT bitCount(CAST(-1 AS UInt128)); +SELECT bitCount(CAST(-1 AS UInt256)); + +SELECT bitCount(CAST(-1 AS Int128)); +SELECT bitCount(CAST(-1 AS Int256)); + +SELECT bitCount(CAST(-1 AS UInt128) - 1); +SELECT bitCount(CAST(-1 AS UInt256) - 2); + +SELECT bitCount(CAST(-1 AS Int128) - 3); +SELECT bitCount(CAST(-1 AS Int256) - 4); + +SELECT bitCount(CAST(0xFFFFFFFFFFFFFFFF AS Int256)); + +SELECT toTypeName(bitCount(1::UInt128)); +SELECT toTypeName(bitCount(1::UInt256)); + +SELECT toTypeName(bitCount(1::Int128)); +SELECT toTypeName(bitCount(1::Int256)); From 4764259f609a024f04ec8c082b63a495efc66d70 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 15 May 2023 14:17:16 +0800 Subject: [PATCH 7/9] Fix a bug with projections and the aggregate_functions_null_for_empty setting (for query_plan_optimize_projection) Fix a bug with projections and the aggregate_functions_null_for_empty setting. This was already fixed in PR #42198 but got forgotten after using query_plan_optimize_projection. --- .../QueryPlan/Optimizations/projectionsCommon.cpp | 4 ++++ ...rojection_aggregate_functions_null_for_empty.reference | 1 + ...1710_projection_aggregate_functions_null_for_empty.sql | 8 ++++++++ 3 files changed, 13 insertions(+) create mode 100644 tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference create mode 100644 tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index a334450fb41..9252137f649 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -42,6 +42,10 @@ bool canUseProjectionForReadingStep(ReadFromMergeTree * reading) if (reading->getContext()->getSettingsRef().allow_experimental_query_deduplication) return false; + // Currently projection don't support settings which implicitly modify aggregate functions. + if (reading->getContext()->getSettingsRef().aggregate_functions_null_for_empty) + return false; + return true; } diff --git a/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference new file mode 100644 index 00000000000..f2a527c4d8d --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference @@ -0,0 +1 @@ +1554690688 diff --git a/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql new file mode 100644 index 00000000000..a77720b6580 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS t1; + +CREATE TABLE t1 (c0 Int32, PRIMARY KEY (c0)) ENGINE=MergeTree; +INSERT INTO t1 VALUES (1554690688); + +SELECT MIN(t1.c0) FROM t1 SETTINGS aggregate_functions_null_for_empty = 1; + +DROP TABLE IF EXISTS t1; From 8d1bcb5c2f23ea0e29c61c8cb3f584586f5f8834 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 15 May 2023 16:28:28 +0800 Subject: [PATCH 8/9] fix typo --- src/Storages/StorageMongoDB.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 682a027440d..2b77f076e7e 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -7,7 +7,7 @@ namespace DB { /* Implements storage in the MongoDB database. - * Use ENGINE = mysql(host_port, database_name, table_name, user_name, password) + * Use ENGINE = MongoDB(host:port, database, collection, user, password [, options]); * Read only. */ From e8f971aa2bdfcdfb12e0091a8aa6a7eb4e96ff8c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 May 2023 09:25:10 -0300 Subject: [PATCH 9/9] use LIMIT_EXCEEDED instead of TOO_LARGE_MAP_SIZE --- src/Common/ErrorCodes.cpp | 1 - src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h | 4 ++-- .../02499_extract_key_value_pairs_multiple_input.sql | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 382b8ed8019..e9dc5649245 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -576,7 +576,6 @@ M(691, UNKNOWN_ELEMENT_OF_ENUM) \ M(692, TOO_MANY_MUTATIONS) \ M(693, AWS_ERROR) \ - M(694, TOO_LARGE_MAP_SIZE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 24c8d3cd89a..3895cf3e77d 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -13,7 +13,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int TOO_LARGE_MAP_SIZE; + extern const int LIMIT_EXCEEDED; } /* @@ -118,7 +118,7 @@ private: if (row_offset > max_number_of_pairs) { - throw Exception(ErrorCodes::TOO_LARGE_MAP_SIZE, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); + throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); } key.commit(); diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 8c5f0365cc7..38d09338774 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -425,7 +425,7 @@ WITH 'Map(String,String)' ) AS x SELECT - x; -- {serverError TOO_LARGE_MAP_SIZE} + x; -- {serverError LIMIT_EXCEEDED} -- { echoOn }