From 8ef07bac89120e7d1e35b2ad61815f5b17384e59 Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Tue, 25 Oct 2022 17:40:54 +0200 Subject: [PATCH 01/78] Failing case corrected and test added --- src/Functions/FunctionsJSON.cpp | 28 ++++++++++++++----- ...xtract_low_cardinality_from_json.reference | 2 ++ ...2473_extract_low_cardinality_from_json.sql | 3 ++ 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02473_extract_low_cardinality_from_json.reference create mode 100644 tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 2f0a0889847..9e44de2cb52 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1303,13 +1303,27 @@ public: static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) { - ColumnString & col_str = assert_cast(dest); - auto & chars = col_str.getChars(); - WriteBufferFromVector buf(chars, AppendModeTag()); - traverse(element, buf); - buf.finalize(); - chars.push_back(0); - col_str.getOffsets().push_back(chars.size()); + if (dest.getDataType() == TypeIndex::LowCardinality) + { + ColumnString::Chars chars; + WriteBufferFromVector buf(chars, AppendModeTag()); + chars.push_back(0); + traverse(element, buf); + buf.finalize(); + std::string str = reinterpret_cast(chars.data()); + chars.push_back(0); + assert_cast(dest).insertData(str.data(), str.size()); + } + else + { + ColumnString & col_str = assert_cast(dest); + auto & chars = col_str.getChars(); + WriteBufferFromVector buf(chars, AppendModeTag()); + traverse(element, buf); + buf.finalize(); + chars.push_back(0); + col_str.getOffsets().push_back(chars.size()); + } return true; } diff --git a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.reference b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.reference new file mode 100644 index 00000000000..73eab066599 --- /dev/null +++ b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.reference @@ -0,0 +1,2 @@ +('{"b":{"c":1,"d":"str"}}','','','') +('{"b":{"c":1,"d":"str"}}','','','') diff --git a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql new file mode 100644 index 00000000000..c9810c77720 --- /dev/null +++ b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql @@ -0,0 +1,3 @@ +-- Tags: no-fasttest +SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); +SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a String, b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); \ No newline at end of file From 241049763bedfeeef29e62c7c5b8698dd7fc4a7f Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Wed, 26 Oct 2022 11:50:45 +0200 Subject: [PATCH 02/78] Fixed failing tests --- src/Functions/FunctionsJSON.cpp | 17 ++++++++++++++--- .../02473_extract_low_cardinality_from_json.sql | 1 - ...2474_extract_fixedstring_from_json.reference | 10 ++++++++++ .../02474_extract_fixedstring_from_json.sql | 8 ++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02474_extract_fixedstring_from_json.reference create mode 100644 tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql diff --git 
a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 9e44de2cb52..d0fd6e2397c 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1330,12 +1330,23 @@ public: // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view) { - ColumnFixedString & col_str = assert_cast(dest); - auto & chars = col_str.getChars(); + ColumnFixedString::Chars chars; WriteBufferFromVector buf(chars, AppendModeTag()); traverse(element, buf); buf.finalize(); - col_str.insertDefault(); + + auto & col_str = assert_cast(dest); + + if (chars.size() > col_str.getN()) + return false; + + chars.push_back(0); + std::string str = reinterpret_cast(chars.data()); + + auto padded_str = str + std::string(col_str.getN() - std::min(col_str.getN(), str.length()), '\0'); + col_str.insertData(str.data(), str.size()); + + return true; } diff --git a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql index c9810c77720..0a0ae398227 100644 --- a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql +++ b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql @@ -1,3 +1,2 @@ --- Tags: no-fasttest SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a String, b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); \ No newline at end of file diff --git a/tests/queries/0_stateless/02474_extract_fixedstring_from_json.reference b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.reference new file mode 100644 index 00000000000..783d12fcf1a --- /dev/null +++ b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.reference @@ -0,0 +1,10 @@ +\0\0\0\0\0\0\0\0\0\0\0 +{"a":123456} +\0\0\0\0\0 +123456 +\0\0\0\0\0 +123456 +\0\0\0\0\0 +\0\0\0\0\0 +131231 +131231 diff --git a/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql new file mode 100644 index 00000000000..cfc47e00cba --- /dev/null +++ b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql @@ -0,0 +1,8 @@ +SELECT JSONExtract('{"a": 123456}', 'FixedString(11)'); +SELECT JSONExtract('{"a": 123456}', 'FixedString(12)'); +SELECT JSONExtract('{"a": "123456"}', 'a', 'FixedString(5)'); +SELECT JSONExtract('{"a": "123456"}', 'a', 'FixedString(6)'); +SELECT JSONExtract('{"a": 123456}', 'a', 'FixedString(5)'); +SELECT JSONExtract('{"a": 123456}', 'a', 'FixedString(6)'); +SELECT JSONExtract(materialize('{"a": 131231}'), 'a', 'LowCardinality(FixedString(5))') FROM numbers(2); +SELECT JSONExtract(materialize('{"a": 131231}'), 'a', 'LowCardinality(FixedString(6))') FROM numbers(2); From c37b1542545ab4a3362afdd266a531a7eeca00ff Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Fri, 28 Oct 2022 12:37:59 +0200 Subject: [PATCH 03/78] Added reverted files and fixes for failing fuzzer tests --- src/Functions/FunctionsJSON.cpp | 106 +++++++++++++----- .../performance/low_cardinality_from_json.xml | 73 ++++++++++++ .../02452_check_low_cardinality.reference | 7 ++ .../02452_check_low_cardinality.sql | 54 +++++++++ ...ct_fixed_string_from_nested_json.reference | 1 + 
..._extract_fixed_string_from_nested_json.sql | 6 + 6 files changed, 219 insertions(+), 28 deletions(-) create mode 100644 tests/performance/low_cardinality_from_json.xml create mode 100644 tests/queries/0_stateless/02452_check_low_cardinality.reference create mode 100644 tests/queries/0_stateless/02452_check_low_cardinality.sql create mode 100644 tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.reference create mode 100644 tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.sql diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index ea2d2cc94af..64c5ff3c2c8 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -20,18 +20,19 @@ #include #include -#include -#include -#include -#include -#include -#include +#include #include #include +#include +#include #include #include -#include +#include #include +#include +#include +#include +#include #include #include @@ -695,8 +696,16 @@ public: else return false; - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(value); + if (dest.getDataType() == TypeIndex::LowCardinality) + { + ColumnLowCardinality & col_low = assert_cast(dest); + col_low.insertData(reinterpret_cast(&value), sizeof(value)); + } + else + { + auto & col_vec = assert_cast &>(dest); + col_vec.insertValue(value); + } return true; } }; @@ -773,8 +782,17 @@ public: return JSONExtractRawImpl::insertResultToColumn(dest, element, {}); auto str = element.getString(); - ColumnString & col_str = assert_cast(dest); - col_str.insertData(str.data(), str.size()); + + if (dest.getDataType() == TypeIndex::LowCardinality) + { + ColumnLowCardinality & col_low = assert_cast(dest); + col_low.insertData(str.data(), str.size()); + } + else + { + ColumnString & col_str = assert_cast(dest); + col_str.insertData(str.data(), str.size()); + } return true; } }; @@ -803,25 +821,33 @@ struct JSONExtractTree } }; - class LowCardinalityNode : public Node + class LowCardinalityFixedStringNode : public Node { public: - LowCardinalityNode(DataTypePtr dictionary_type_, std::unique_ptr impl_) - : dictionary_type(dictionary_type_), impl(std::move(impl_)) {} + explicit LowCardinalityFixedStringNode(const size_t fixed_length_) : fixed_length(fixed_length_) { } bool insertResultToColumn(IColumn & dest, const Element & element) override { - auto from_col = dictionary_type->createColumn(); - if (impl->insertResultToColumn(*from_col, element)) - { - std::string_view value = from_col->getDataAt(0).toView(); - assert_cast(dest).insertData(value.data(), value.size()); - return true; - } - return false; + // If element is an object we delegate the insertion to JSONExtractRawImpl + if (element.isObject()) + return JSONExtractRawImpl::insertResultToLowCardinalityFixedStringColumn(dest, element, fixed_length); + else if (!element.isString()) + return false; + + auto str = element.getString(); + if (str.size() > fixed_length) + return false; + + // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation. 
+ // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution) + // the data is padded here and written directly to the Low Cardinality Column + auto padded_str = str.data() + std::string(fixed_length - std::min(fixed_length, str.length()), '\0'); + + assert_cast(dest).insertData(padded_str.data(), padded_str.size()); + return true; } + private: - DataTypePtr dictionary_type; - std::unique_ptr impl; + const size_t fixed_length; }; class UUIDNode : public Node @@ -833,7 +859,15 @@ struct JSONExtractTree return false; auto uuid = parseFromString(element.getString()); - assert_cast(dest).insert(uuid); + if (dest.getDataType() == TypeIndex::LowCardinality) + { + ColumnLowCardinality & col_low = assert_cast(dest); + col_low.insertData(reinterpret_cast(&uuid), sizeof(uuid)); + } + else + { + assert_cast(dest).insert(uuid); + } return true; } }; @@ -853,6 +887,7 @@ struct JSONExtractTree assert_cast &>(dest).insert(result); return true; } + private: DataTypePtr data_type; }; @@ -871,13 +906,18 @@ struct JSONExtractTree public: bool insertResultToColumn(IColumn & dest, const Element & element) override { - if (!element.isString()) + if (element.isNull()) return false; - auto & col_str = assert_cast(dest); + + if (!element.isString()) + return JSONExtractRawImpl::insertResultToFixedStringColumn(dest, element, {}); + auto str = element.getString(); + auto & col_str = assert_cast(dest); if (str.size() > col_str.getN()) return false; col_str.insertData(str.data(), str.size()); + return true; } }; @@ -1099,9 +1139,19 @@ struct JSONExtractTree case TypeIndex::UUID: return std::make_unique(); case TypeIndex::LowCardinality: { + // The low cardinality case is treated in two different ways: + // For FixedString type, an especial class is implemented for inserting the data in the destination column, + // as the string length must be passed in order to check and pad the incoming data. + // For the rest of low cardinality types, the insertion is done in their corresponding class, adapting the data + // as needed for the insertData function of the ColumnLowCardinality. 
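(Editorial aside, not part of the patch.) The comment block above describes the padding rule applied for LowCardinality(FixedString(N)) extraction. A minimal standalone sketch of that rule, using plain standard-library types instead of the ClickHouse column classes (the helper name padToFixedLength is illustrative only): a value shorter than N is right-padded with zero bytes, a value longer than N is rejected, mirroring the behaviour ColumnFixedString would otherwise provide.

#include <optional>
#include <string>
#include <string_view>

// Returns the value padded to exactly fixed_length bytes, or nullopt if it does not fit.
std::optional<std::string> padToFixedLength(std::string_view value, size_t fixed_length)
{
    if (value.size() > fixed_length)
        return std::nullopt;               // would not fit into FixedString(N)
    std::string padded(value);
    padded.resize(fixed_length, '\0');     // right-pad with zero bytes, as FixedString expects
    return padded;
}
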
auto dictionary_type = typeid_cast(type.get())->getDictionaryType(); + if ((*dictionary_type).getTypeId() == TypeIndex::FixedString) + { + auto fixed_length = typeid_cast(dictionary_type.get())->getN(); + return std::make_unique(fixed_length); + } auto impl = build(function_name, dictionary_type); - return std::make_unique(dictionary_type, std::move(impl)); + return impl; } case TypeIndex::Decimal256: return std::make_unique>(type); case TypeIndex::Decimal128: return std::make_unique>(type); diff --git a/tests/performance/low_cardinality_from_json.xml b/tests/performance/low_cardinality_from_json.xml new file mode 100644 index 00000000000..ac6542ac503 --- /dev/null +++ b/tests/performance/low_cardinality_from_json.xml @@ -0,0 +1,73 @@ + + + + + string_json + + '{"a": "hi", "b": "hello", "c": "hola", "d": "see you, bye, bye"}' + + + + int_json + + '{"a": 11, "b": 2222, "c": 33333333, "d": 4444444444444444}' + + + + uuid_json + + '{"a": "2d49dc6e-ddce-4cd0-afb8-790956df54c4", "b": "2d49dc6e-ddce-4cd0-afb8-790956df54c3", "c": "2d49dc6e-ddce-4cd0-afb8-790956df54c1", "d": "2d49dc6e-ddce-4cd0-afb8-790956df54c1"}' + + + + low_cardinality_tuple_string + + 'Tuple(a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String) )' + + + + low_cardinality_tuple_fixed_string + + 'Tuple(a LowCardinality(FixedString(20)), b LowCardinality(FixedString(20)), c LowCardinality(FixedString(20)), d LowCardinality(FixedString(20)) )' + + + + low_cardinality_tuple_int8 + + 'Tuple(a LowCardinality(Int8), b LowCardinality(Int8), c LowCardinality(Int8), d LowCardinality(Int8) )' + + + + low_cardinality_tuple_int16 + + 'Tuple(a LowCardinality(Int16), b LowCardinality(Int16), c LowCardinality(Int16), d LowCardinality(Int16) )' + + + + low_cardinality_tuple_int32 + + 'Tuple(a LowCardinality(Int32), b LowCardinality(Int32), c LowCardinality(Int32), d LowCardinality(Int32) )' + + + + low_cardinality_tuple_int64 + + 'Tuple(a LowCardinality(Int64), b LowCardinality(Int64), c LowCardinality(Int64), d LowCardinality(Int64) )' + + + + low_cardinality_tuple_uuid + + 'Tuple(a LowCardinality(UUID), b LowCardinality(UUID), c LowCardinality(UUID), d LowCardinality(UUID) )' + + + + + SELECT 'fixed_string_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({string_json}), {low_cardinality_tuple_fixed_string})) FORMAT Null + SELECT 'string_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({string_json}), {low_cardinality_tuple_string})) FORMAT Null + SELECT 'int8_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int8})) FORMAT Null + SELECT 'int16_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int16})) FORMAT Null + SELECT 'int32_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int32})) FORMAT Null + SELECT 'int64_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int64})) FORMAT Null + SELECT 'uuid_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({uuid_json}), {low_cardinality_tuple_uuid})) FORMAT Null + \ No newline at end of file diff --git a/tests/queries/0_stateless/02452_check_low_cardinality.reference b/tests/queries/0_stateless/02452_check_low_cardinality.reference new file mode 100644 index 00000000000..700778e02c7 --- /dev/null +++ b/tests/queries/0_stateless/02452_check_low_cardinality.reference @@ -0,0 +1,7 @@ 
+('hi','hello','hola','see you, bye, bye') +('hi\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','hello\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','hola\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','see you, bye, bye\0\0\0') +(11,0,0,0) +(11,2222,0,0) +(11,2222,33333333,0) +(11,2222,33333333,4444444444444444) +('2d49dc6e-ddce-4cd0-afb8-790956df54c4','2d49dc6e-ddce-4cd0-afb8-790956df54c3','2d49dc6e-ddce-4cd0-afb8-790956df54c1','2d49dc6e-ddce-4cd0-afb8-790956df54c1') diff --git a/tests/queries/0_stateless/02452_check_low_cardinality.sql b/tests/queries/0_stateless/02452_check_low_cardinality.sql new file mode 100644 index 00000000000..166be281405 --- /dev/null +++ b/tests/queries/0_stateless/02452_check_low_cardinality.sql @@ -0,0 +1,54 @@ +DROP TABLE IF EXISTS test_low_cardinality_string; +DROP TABLE IF EXISTS test_low_cardinality_uuid; +DROP TABLE IF EXISTS test_low_cardinality_int; +CREATE TABLE test_low_cardinality_string (data String) ENGINE MergeTree ORDER BY data; +CREATE TABLE test_low_cardinality_uuid (data String) ENGINE MergeTree ORDER BY data; +CREATE TABLE test_low_cardinality_int (data String) ENGINE MergeTree ORDER BY data; +INSERT INTO test_low_cardinality_string (data) VALUES ('{"a": "hi", "b": "hello", "c": "hola", "d": "see you, bye, bye"}'); +INSERT INTO test_low_cardinality_int (data) VALUES ('{"a": 11, "b": 2222, "c": 33333333, "d": 4444444444444444}'); +INSERT INTO test_low_cardinality_uuid (data) VALUES ('{"a": "2d49dc6e-ddce-4cd0-afb8-790956df54c4", "b": "2d49dc6e-ddce-4cd0-afb8-790956df54c3", "c": "2d49dc6e-ddce-4cd0-afb8-790956df54c1", "d": "2d49dc6e-ddce-4cd0-afb8-790956df54c1"}'); +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(String), + b LowCardinality(String), + c LowCardinality(String), + d LowCardinality(String) + )') AS json FROM test_low_cardinality_string; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(FixedString(20)), + b LowCardinality(FixedString(20)), + c LowCardinality(FixedString(20)), + d LowCardinality(FixedString(20)) + )') AS json FROM test_low_cardinality_string; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(Int8), + b LowCardinality(Int8), + c LowCardinality(Int8), + d LowCardinality(Int8) + )') AS json FROM test_low_cardinality_int; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(Int16), + b LowCardinality(Int16), + c LowCardinality(Int16), + d LowCardinality(Int16) + )') AS json FROM test_low_cardinality_int; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(Int32), + b LowCardinality(Int32), + c LowCardinality(Int32), + d LowCardinality(Int32) + )') AS json FROM test_low_cardinality_int; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(Int64), + b LowCardinality(Int64), + c LowCardinality(Int64), + d LowCardinality(Int64) + )') AS json FROM test_low_cardinality_int; +SELECT JSONExtract(data, 'Tuple( + a LowCardinality(UUID), + b LowCardinality(UUID), + c LowCardinality(UUID), + d LowCardinality(UUID) + )') AS json FROM test_low_cardinality_uuid; +DROP TABLE test_low_cardinality_string; +DROP TABLE test_low_cardinality_uuid; +DROP TABLE test_low_cardinality_int; diff --git a/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.reference b/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.reference new file mode 100644 index 00000000000..3a528a24821 --- /dev/null +++ b/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.reference @@ -0,0 +1 @@ +('{"b":{"c":1,"d":"str"}}\0') diff --git a/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.sql 
b/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.sql new file mode 100644 index 00000000000..449713d396f --- /dev/null +++ b/tests/queries/0_stateless/02455_extract_fixed_string_from_nested_json.sql @@ -0,0 +1,6 @@ +-- Tags: no-fasttest +DROP TABLE IF EXISTS test_fixed_string_nested_json; +CREATE TABLE test_fixed_string_nested_json (data String) ENGINE MergeTree ORDER BY data; +INSERT INTO test_fixed_string_nested_json (data) VALUES ('{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); +SELECT JSONExtract(data, 'Tuple(a FixedString(24))') AS json FROM test_fixed_string_nested_json; +DROP TABLE test_fixed_string_nested_json; \ No newline at end of file From 9f61ed4d50f911d31081f86af6bfcc0db2af8e3b Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Mon, 31 Oct 2022 13:23:27 +0100 Subject: [PATCH 04/78] Added no-fasttest tag for JSON tests --- tests/queries/0_stateless/02452_check_low_cardinality.sql | 1 + .../queries/0_stateless/02474_extract_fixedstring_from_json.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02452_check_low_cardinality.sql b/tests/queries/0_stateless/02452_check_low_cardinality.sql index 166be281405..e9cb8c800c7 100644 --- a/tests/queries/0_stateless/02452_check_low_cardinality.sql +++ b/tests/queries/0_stateless/02452_check_low_cardinality.sql @@ -1,3 +1,4 @@ +-- Tags: no-fasttest DROP TABLE IF EXISTS test_low_cardinality_string; DROP TABLE IF EXISTS test_low_cardinality_uuid; DROP TABLE IF EXISTS test_low_cardinality_int; diff --git a/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql index cfc47e00cba..336dda411da 100644 --- a/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql +++ b/tests/queries/0_stateless/02474_extract_fixedstring_from_json.sql @@ -1,3 +1,4 @@ +-- Tags: no-fasttest SELECT JSONExtract('{"a": 123456}', 'FixedString(11)'); SELECT JSONExtract('{"a": 123456}', 'FixedString(12)'); SELECT JSONExtract('{"a": "123456"}', 'a', 'FixedString(5)'); From cf05ac84dc74adf3daa4370625d3082ce6a6ff91 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 4 Nov 2022 11:12:45 +0100 Subject: [PATCH 05/78] Add no-fasttest tag --- .../0_stateless/02473_extract_low_cardinality_from_json.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql index 0a0ae398227..664c52e772f 100644 --- a/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql +++ b/tests/queries/0_stateless/02473_extract_low_cardinality_from_json.sql @@ -1,2 +1,3 @@ +-- Tags: no-fasttest SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); -SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a String, b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); \ No newline at end of file +SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a String, b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))'); From 2daec0b45e781366178a857319990898dd8ef14e Mon Sep 17 00:00:00 2001 From: serxa Date: Mon, 7 Nov 2022 18:05:40 +0000 Subject: [PATCH 06/78] S3 request per second rate throttling + refactoring --- src/Backups/BackupIO_S3.cpp | 9 ++-- 
src/Backups/BackupIO_S3.h | 3 +- src/Common/ProfileEvents.cpp | 2 +- src/Common/Throttler.cpp | 2 - src/Common/Throttler.h | 2 + src/Coordination/KeeperSnapshotManagerS3.cpp | 6 ++- src/Core/Settings.h | 4 ++ .../ObjectStorages/S3/S3ObjectStorage.cpp | 15 ++++--- src/Disks/ObjectStorages/S3/diskSettings.cpp | 34 +++++++++++---- src/Disks/ObjectStorages/S3/diskSettings.h | 2 +- src/Disks/ObjectStorages/S3/parseConfig.h | 27 ------------ .../ObjectStorages/S3/registerDiskS3.cpp | 13 ++---- src/IO/ReadBufferFromS3.cpp | 10 ++--- src/IO/ReadBufferFromS3.h | 11 ++--- src/IO/S3/PocoHTTPClient.cpp | 26 +++++++++++- src/IO/S3/PocoHTTPClient.h | 17 +++++++- src/IO/S3/tests/gtest_aws_s3_client.cpp | 8 +++- src/IO/S3Common.cpp | 31 ++++++++++++-- src/IO/S3Common.h | 5 ++- src/Storages/StorageS3.cpp | 42 ++++++++++--------- src/Storages/StorageS3.h | 7 ++-- src/Storages/StorageS3Settings.cpp | 21 ++++++++++ src/Storages/StorageS3Settings.h | 7 +++- 23 files changed, 198 insertions(+), 106 deletions(-) delete mode 100644 src/Disks/ObjectStorages/S3/parseConfig.h diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 12038a8a30c..3f723c98a2a 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -46,7 +46,7 @@ namespace context->getRemoteHostFilter(), static_cast(context->getGlobalContext()->getSettingsRef().s3_max_redirects), context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ false); + /* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {}); client_configuration.endpointOverride = s3_uri.endpoint; client_configuration.maxConnections = static_cast(context->getSettingsRef().s3_max_connections); @@ -86,8 +86,8 @@ BackupReaderS3::BackupReaderS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) - , max_single_read_retries(context_->getSettingsRef().s3_max_single_read_retries) , read_settings(context_->getReadSettings()) + , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings) { } @@ -115,7 +115,7 @@ UInt64 BackupReaderS3::getFileSize(const String & file_name) std::unique_ptr BackupReaderS3::readFile(const String & file_name) { return std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, max_single_read_retries, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, rw_settings, read_settings); } @@ -123,7 +123,6 @@ BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) - , max_single_read_retries(context_->getSettingsRef().s3_max_single_read_retries) , read_settings(context_->getReadSettings()) , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings) { @@ -318,7 +317,7 @@ bool BackupWriterS3::fileContentsEqual(const String & file_name, const String & try { auto in = std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, max_single_read_retries, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, rw_settings, read_settings); String actual_file_contents(expected_file_contents.size(), ' '); 
return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) && (actual_file_contents == expected_file_contents) && in->eof(); diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 471ddcc06e6..17a365da396 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -39,8 +39,8 @@ public: private: S3::URI s3_uri; std::shared_ptr client; - UInt64 max_single_read_retries; ReadSettings read_settings; + S3Settings::ReadWriteSettings rw_settings; }; @@ -82,7 +82,6 @@ private: S3::URI s3_uri; std::shared_ptr client; - UInt64 max_single_read_retries; ReadSettings read_settings; S3Settings::ReadWriteSettings rw_settings; }; diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 2f801e496fa..a3ceecf8041 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -61,7 +61,7 @@ M(NetworkSendElapsedMicroseconds, "Total time spent waiting for data to send to network or sending data to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries..") \ M(NetworkReceiveBytes, "Total number of bytes received from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ - M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform the 'max_network_bandwidth' setting.") \ + M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_network_bandwidth' and other throttling settings.") \ \ M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \ \ diff --git a/src/Common/Throttler.cpp b/src/Common/Throttler.cpp index b38777efc03..79625d4894c 100644 --- a/src/Common/Throttler.cpp +++ b/src/Common/Throttler.cpp @@ -20,8 +20,6 @@ namespace ErrorCodes /// Just 10^9. 
static constexpr auto NS = 1000000000UL; -static const size_t default_burst_seconds = 1; - Throttler::Throttler(size_t max_speed_, const std::shared_ptr & parent_) : max_speed(max_speed_) , max_burst(max_speed_ * default_burst_seconds) diff --git a/src/Common/Throttler.h b/src/Common/Throttler.h index 9b6eff13506..708e9b939fa 100644 --- a/src/Common/Throttler.h +++ b/src/Common/Throttler.h @@ -17,6 +17,8 @@ namespace DB class Throttler { public: + static const size_t default_burst_seconds = 1; + Throttler(size_t max_speed_, size_t max_burst_, const std::shared_ptr & parent_ = nullptr) : max_speed(max_speed_), max_burst(max_burst_), limit_exceeded_exception_message(""), tokens(max_burst), parent(parent_) {} diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 2e19d496407..8777ef341ff 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -93,7 +93,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo auth_settings.region, RemoteHostFilter(), s3_max_redirects, enable_s3_requests_logging, - /* for_disk_s3 = */ false); + /* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {}); client_configuration.endpointOverride = new_uri.endpoint; @@ -194,13 +194,15 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa lock_writer.finalize(); // We read back the written UUID, if it's the same we can upload the file + S3Settings::ReadWriteSettings rw_settings; + rw_settings.max_single_read_retries = 1; ReadBufferFromS3 lock_reader { s3_client->client, s3_client->uri.bucket, lock_file, "", - 1, + rw_settings, {} }; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5c6ca1a1d57..e36ff1985fb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -90,6 +90,10 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ + M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate. Zero means unlimited.", 0) \ + M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ + M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate. Zero means unlimited.", 0) \ + M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. 
By default (0) equals to `s3_max_put_rps`", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 0c421ee03d7..21b7ff4f0f1 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -175,7 +175,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT bucket, path, version_id, - settings_ptr->s3_settings.max_single_read_retries, + settings_ptr->s3_settings, disk_read_settings, /* use_external_buffer */true, /* offset */0, @@ -212,7 +212,7 @@ std::unique_ptr S3ObjectStorage::readObject( /// NOLINT bucket, object.absolute_path, version_id, - settings_ptr->s3_settings.max_single_read_retries, + settings_ptr->s3_settings, patchSettings(read_settings)); } @@ -627,17 +627,20 @@ void S3ObjectStorage::startup() void S3ObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) { - s3_settings.set(getSettings(config, config_prefix, context)); - client.set(getClient(config, config_prefix, context)); + auto new_s3_settings = getSettings(config, config_prefix, context); + auto new_client = getClient(config, config_prefix, context, *new_s3_settings); + s3_settings.set(std::move(new_s3_settings)); + client.set(std::move(new_client)); applyRemoteThrottlingSettings(context); } std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & new_namespace, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) { + auto new_s3_settings = getSettings(config, config_prefix, context); + auto new_client = getClient(config, config_prefix, context, *new_s3_settings); return std::make_unique( - getClient(config, config_prefix, context), - getSettings(config, config_prefix, context), + std::move(new_client), std::move(new_s3_settings), version_id, s3_capabilities, new_namespace, S3::URI(Poco::URI(config.getString(config_prefix + ".endpoint"))).endpoint); } diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index e61987163d2..707675c01fd 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -32,14 +33,23 @@ namespace ErrorCodes std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { + const Settings & settings = context->getSettingsRef(); S3Settings::ReadWriteSettings rw_settings; - rw_settings.max_single_read_retries = config.getUInt64(config_prefix + ".s3_max_single_read_retries", context->getSettingsRef().s3_max_single_read_retries); - rw_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", context->getSettingsRef().s3_min_upload_part_size); - rw_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", context->getSettingsRef().s3_upload_part_size_multiply_factor); - rw_settings.upload_part_size_multiply_parts_count_threshold = 
config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", context->getSettingsRef().s3_upload_part_size_multiply_parts_count_threshold); - rw_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", context->getSettingsRef().s3_max_single_part_upload_size); - rw_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", context->getSettingsRef().s3_check_objects_after_upload); - rw_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", context->getSettingsRef().s3_max_unexpected_write_error_retries); + rw_settings.max_single_read_retries = config.getUInt64(config_prefix + ".s3_max_single_read_retries", settings.s3_max_single_read_retries); + rw_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", settings.s3_min_upload_part_size); + rw_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", settings.s3_upload_part_size_multiply_factor); + rw_settings.upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", settings.s3_upload_part_size_multiply_parts_count_threshold); + rw_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", settings.s3_max_single_part_upload_size); + rw_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", settings.s3_check_objects_after_upload); + rw_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", settings.s3_max_unexpected_write_error_retries); + + // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. + if (UInt64 max_get_rps = config.getUInt64(config_prefix + ".s3_max_get_rps", settings.s3_max_get_rps)) + rw_settings.get_request_throttler = std::make_shared( + max_get_rps, config.getUInt64(config_prefix + ".s3_max_get_burst", settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); + if (UInt64 max_put_rps = config.getUInt64(config_prefix + ".s3_max_put_rps", settings.s3_max_put_rps)) + rw_settings.put_request_throttler = std::make_shared( + max_put_rps, config.getUInt64(config_prefix + ".s3_max_put_burst", settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); return std::make_unique( rw_settings, @@ -112,14 +122,20 @@ std::shared_ptr getProxyConfiguration(const String & pre } -std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) +std::unique_ptr getClient( + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context, + const S3ObjectStorageSettings & settings) { S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( config.getString(config_prefix + ".region", ""), context->getRemoteHostFilter(), static_cast(context->getGlobalContext()->getSettingsRef().s3_max_redirects), context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ true); + /* for_disk_s3 = */ true, + settings.s3_settings.get_request_throttler, + settings.s3_settings.put_request_throttler); S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint"))); if (uri.key.back() != '/') diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 05ba8819f83..04eb7aced8e 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -22,7 +22,7 @@ struct S3ObjectStorageSettings; std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); -std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); +std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, const S3ObjectStorageSettings & settings); } diff --git a/src/Disks/ObjectStorages/S3/parseConfig.h b/src/Disks/ObjectStorages/S3/parseConfig.h deleted file mode 100644 index 1defc673c2e..00000000000 --- a/src/Disks/ObjectStorages/S3/parseConfig.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - - -std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); - -std::shared_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); - - -} diff --git a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp index e09aef22122..e73accbb956 100644 --- a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp +++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp @@ -130,21 +130,16 @@ void registerDiskS3(DiskFactory & factory) chassert(type == "s3" || type == "s3_plain"); MetadataStoragePtr metadata_storage; + auto settings = getSettings(config, config_prefix, context); + auto client = getClient(config, config_prefix, context, *settings); if (type == "s3_plain") { - s3_storage = std::make_shared( - getClient(config, config_prefix, context), - getSettings(config, config_prefix, context), - uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); metadata_storage = std::make_shared(s3_storage, uri.key); } else { - s3_storage = std::make_shared( - getClient(config, config_prefix, context), - getSettings(config, config_prefix, 
context), - uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); - + s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); metadata_storage = std::make_shared(metadata_disk, uri.key); } diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index c49941b025d..fa748469ca0 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -45,7 +45,7 @@ ReadBufferFromS3::ReadBufferFromS3( const String & bucket_, const String & key_, const String & version_id_, - UInt64 max_single_read_retries_, + const S3Settings::ReadWriteSettings & s3_settings_, const ReadSettings & settings_, bool use_external_buffer_, size_t offset_, @@ -56,7 +56,7 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , version_id(version_id_) - , max_single_read_retries(max_single_read_retries_) + , s3_settings(s3_settings_) , offset(offset_) , read_until_position(read_until_position_) , read_settings(settings_) @@ -105,7 +105,7 @@ bool ReadBufferFromS3::nextImpl() } size_t sleep_time_with_backoff_milliseconds = 100; - for (size_t attempt = 0; (attempt < max_single_read_retries) && !next_result; ++attempt) + for (size_t attempt = 0; attempt < s3_settings.max_single_read_retries && !next_result; ++attempt) { Stopwatch watch; try @@ -166,7 +166,7 @@ bool ReadBufferFromS3::nextImpl() attempt, e.message()); - if (attempt + 1 == max_single_read_retries) + if (attempt + 1 == s3_settings.max_single_read_retries) throw; /// Pause before next attempt. @@ -349,7 +349,7 @@ SeekableReadBufferPtr ReadBufferS3Factory::getReader() bucket, key, version_id, - s3_max_single_read_retries, + s3_settings, read_settings, false /*use_external_buffer*/, next_range->first, diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index cc836bba495..3e1a26f7713 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "config.h" #if USE_AWS_S3 @@ -33,7 +34,7 @@ private: String bucket; String key; String version_id; - UInt64 max_single_read_retries; + const S3Settings::ReadWriteSettings s3_settings; /// These variables are atomic because they can be used for `logging only` /// (where it is not important to get consistent result) @@ -52,7 +53,7 @@ public: const String & bucket_, const String & key_, const String & version_id_, - UInt64 max_single_read_retries_, + const S3Settings::ReadWriteSettings & s3_settings_, const ReadSettings & settings_, bool use_external_buffer = false, size_t offset_ = 0, @@ -100,7 +101,7 @@ public: const String & version_id_, size_t range_step_, size_t object_size_, - UInt64 s3_max_single_read_retries_, + const S3Settings::ReadWriteSettings & s3_settings_, const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) @@ -110,7 +111,7 @@ public: , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) - , s3_max_single_read_retries(s3_max_single_read_retries_) + , s3_settings(s3_settings_) { assert(range_step > 0); assert(range_step < object_size); @@ -135,7 +136,7 @@ private: size_t range_step; size_t object_size; - UInt64 s3_max_single_read_retries; + const S3Settings::ReadWriteSettings s3_settings; }; } diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 7d053bebe4a..33dd3250c9f 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ 
b/src/IO/S3/PocoHTTPClient.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -76,12 +77,16 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration( const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, bool enable_s3_requests_logging_, - bool for_disk_s3_) + bool for_disk_s3_, + const ThrottlerPtr & get_request_throttler_, + const ThrottlerPtr & put_request_throttler_) : force_region(force_region_) , remote_host_filter(remote_host_filter_) , s3_max_redirects(s3_max_redirects_) , enable_s3_requests_logging(enable_s3_requests_logging_) , for_disk_s3(for_disk_s3_) + , get_request_throttler(get_request_throttler_) + , put_request_throttler(put_request_throttler_) { } @@ -128,6 +133,8 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_config , s3_max_redirects(client_configuration.s3_max_redirects) , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging) , for_disk_s3(client_configuration.for_disk_s3) + , get_request_throttler(client_configuration.get_request_throttler) + , put_request_throttler(client_configuration.put_request_throttler) , extra_headers(client_configuration.extra_headers) { } @@ -245,6 +252,23 @@ void PocoHTTPClient::makeRequestInternal( if (enable_s3_requests_logging) LOG_TEST(log, "Make request to: {}", uri); + switch (request.GetMethod()) + { + case Aws::Http::HttpMethod::HTTP_GET: + case Aws::Http::HttpMethod::HTTP_HEAD: + if (get_request_throttler) + get_request_throttler->add(1); + break; + case Aws::Http::HttpMethod::HTTP_PUT: + case Aws::Http::HttpMethod::HTTP_POST: + case Aws::Http::HttpMethod::HTTP_PATCH: + if (put_request_throttler) + put_request_throttler->add(1); + break; + case Aws::Http::HttpMethod::HTTP_DELETE: + break; // Not throttled + } + addMetric(request, S3MetricType::Count); CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests}; diff --git a/src/IO/S3/PocoHTTPClient.h b/src/IO/S3/PocoHTTPClient.h index 5649638285d..ed6e1793c2c 100644 --- a/src/IO/S3/PocoHTTPClient.h +++ b/src/IO/S3/PocoHTTPClient.h @@ -8,6 +8,7 @@ #if USE_AWS_S3 #include +#include #include #include #include @@ -48,6 +49,8 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration unsigned int s3_max_redirects; bool enable_s3_requests_logging; bool for_disk_s3; + ThrottlerPtr get_request_throttler; + ThrottlerPtr put_request_throttler; HeaderCollection extra_headers; void updateSchemeAndRegion(); @@ -60,7 +63,9 @@ private: const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, bool enable_s3_requests_logging_, - bool for_disk_s3_ + bool for_disk_s3_, + const ThrottlerPtr & get_request_throttler_, + const ThrottlerPtr & put_request_throttler_ ); /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization. @@ -154,6 +159,16 @@ private: unsigned int s3_max_redirects; bool enable_s3_requests_logging; bool for_disk_s3; + + /// Limits get request per second rate for GET, SELECT and all other requests, excluding throttled by put throttler + /// (i.e. throttles GetObject, HeadObject) + ThrottlerPtr get_request_throttler; + + /// Limits put request per second rate for PUT, COPY, POST, LIST requests + /// (i.e. 
throttles PutObject, CopyObject, ListObjects, CreateMultipartUpload, UploadPartCopy, UploadPart, CompleteMultipartUpload) + /// NOTE: DELETE and CANCEL requests are not throttled by either put or get throttler + ThrottlerPtr put_request_throttler; + const HeaderCollection extra_headers; }; diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 9b2a65d84fc..96a7c8b3dfb 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -88,7 +88,9 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) remote_host_filter, s3_max_redirects, enable_s3_requests_logging, - /* for_disk_s3 = */ false + /* for_disk_s3 = */ false, + /* get_request_throttler = */ {}, + /* put_request_throttler = */ {} ); client_configuration.endpointOverride = uri.endpoint; @@ -113,12 +115,14 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) ASSERT_TRUE(client); DB::ReadSettings read_settings; + DB::S3Settings::ReadWriteSettings rw_settings; + rw_settings.max_single_read_retries = max_single_read_retries; DB::ReadBufferFromS3 read_buffer( client, uri.bucket, uri.key, version_id, - max_single_read_retries, + rw_settings, read_settings ); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 859f5ce796b..5af09275df4 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -573,7 +573,14 @@ public: /// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be /// quite verbose even if nobody configured them. So we use our provider first and only after it use default providers. { - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration( + configuration.region, + configuration.remote_host_filter, + configuration.s3_max_redirects, + configuration.enable_s3_requests_logging, + configuration.for_disk_s3, + configuration.get_request_throttler, + configuration.put_request_throttler); AddProvider(std::make_shared(aws_client_configuration)); } @@ -610,7 +617,14 @@ public: } else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true") { - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration( + configuration.region, + configuration.remote_host_filter, + configuration.s3_max_redirects, + configuration.enable_s3_requests_logging, + configuration.for_disk_s3, + configuration.get_request_throttler, + configuration.put_request_throttler); /// See MakeDefaultHttpResourceClientConfiguration(). 
/// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside @@ -731,9 +745,18 @@ namespace S3 const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, bool enable_s3_requests_logging, - bool for_disk_s3) + bool for_disk_s3, + const ThrottlerPtr & get_request_throttler, + const ThrottlerPtr & put_request_throttler) { - return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging, for_disk_s3); + return PocoHTTPClientConfiguration( + force_region, + remote_host_filter, + s3_max_redirects, + enable_s3_requests_logging, + for_disk_s3, + get_request_throttler, + put_request_throttler); } URI::URI(const Poco::URI & uri_) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 93e5eb78c7f..01a6b8d5d82 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -17,6 +17,7 @@ #include #include +#include namespace Aws::S3 { @@ -88,7 +89,9 @@ public: const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, bool enable_s3_requests_logging, - bool for_disk_s3); + bool for_disk_s3, + const ThrottlerPtr & get_request_throttler, + const ThrottlerPtr & put_request_throttler); private: ClientFactory(); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9bbccf5f582..9309e3f7384 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -100,7 +100,8 @@ public: const Block & virtual_header_, ContextPtr context_, std::unordered_map * object_infos_, - Strings * read_keys_) + Strings * read_keys_, + const S3Settings::ReadWriteSettings & rw_settings_) : WithContext(context_) , client(client_) , globbed_uri(globbed_uri_) @@ -108,6 +109,7 @@ public: , virtual_header(virtual_header_) , object_infos(object_infos_) , read_keys(read_keys_) + , rw_settings(rw_settings_) { if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception("Expression can not have wildcards inside bucket name", ErrorCodes::UNEXPECTED_EXPRESSION); @@ -258,6 +260,7 @@ private: bool is_finished{false}; std::unordered_map * object_infos; Strings * read_keys; + S3Settings::ReadWriteSettings rw_settings; }; StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( @@ -267,8 +270,9 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( const Block & virtual_header, ContextPtr context, std::unordered_map * object_infos_, - Strings * read_keys_) - : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_)) + Strings * read_keys_, + const S3Settings::ReadWriteSettings & rw_settings_) + : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_, rw_settings_)) { } @@ -381,7 +385,7 @@ StorageS3Source::StorageS3Source( std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - UInt64 max_single_read_retries_, + const S3Settings::ReadWriteSettings & rw_settings_, String compression_hint_, const std::shared_ptr & client_, const String & bucket_, @@ -397,7 +401,7 @@ StorageS3Source::StorageS3Source( , format(format_) , columns_desc(columns_) , max_block_size(max_block_size_) - , max_single_read_retries(max_single_read_retries_) + , rw_settings(rw_settings_) , compression_hint(std::move(compression_hint_)) , client(client_) , sample_block(sample_block_) @@ -463,7 +467,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (!use_parallel_download || object_too_small) { LOG_TRACE(log, "Downloading object of 
size {} from S3 in single thread", object_size); - return std::make_unique(client, bucket, key, version_id, max_single_read_retries, getContext()->getReadSettings()); + return std::make_unique(client, bucket, key, version_id, rw_settings, getContext()->getReadSettings()); } assert(object_size > 0); @@ -475,7 +479,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k } auto factory = std::make_unique( - client, bucket, key, version_id, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); + client, bucket, key, version_id, download_buffer_size, object_size, rw_settings, getContext()->getReadSettings()); LOG_TRACE( log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); @@ -815,7 +819,7 @@ std::shared_ptr StorageS3::createFileIterator( { /// Iterate through disclosed globs and make a source for each file auto glob_iterator = std::make_shared( - *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys); + *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys, s3_configuration.rw_settings); return std::make_shared([glob_iterator]() { return glob_iterator->next(); }); } else @@ -905,7 +909,7 @@ Pipe StorageS3::read( format_settings, columns_description, max_block_size, - s3_configuration.rw_settings.max_single_read_retries, + s3_configuration.rw_settings, compression_method, s3_configuration.client, s3_configuration.uri.bucket, @@ -1022,9 +1026,7 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration & upd) { auto settings = ctx->getStorageS3Settings().getSettings(upd.uri.uri.toString()); - const auto & config_rw_settings = settings.rw_settings; - - if (upd.rw_settings != config_rw_settings) + if (upd.rw_settings != settings.rw_settings) upd.rw_settings = settings.rw_settings; upd.rw_settings.updateFromSettingsIfEmpty(ctx->getSettings()); @@ -1045,7 +1047,9 @@ void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration ctx->getRemoteHostFilter(), static_cast(ctx->getGlobalContext()->getSettingsRef().s3_max_redirects), ctx->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ false); + /* for_disk_s3 = */ false, + upd.rw_settings.get_request_throttler, + upd.rw_settings.put_request_throttler); client_configuration.endpointOverride = upd.uri.endpoint; client_configuration.maxConnections = static_cast(upd.rw_settings.max_connections); @@ -1082,15 +1086,15 @@ void StorageS3::processNamedCollectionResult(StorageS3Configuration & configurat else if (arg_name == "max_single_read_retries") configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_read_retries"); else if (arg_name == "min_upload_part_size") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); + configuration.rw_settings.min_upload_part_size = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); else if (arg_name == "upload_part_size_multiply_factor") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); + configuration.rw_settings.upload_part_size_multiply_factor = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); else if 
(arg_name == "upload_part_size_multiply_parts_count_threshold") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); + configuration.rw_settings.upload_part_size_multiply_parts_count_threshold = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); else if (arg_name == "max_single_part_upload_size") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); + configuration.rw_settings.max_single_part_upload_size = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); else if (arg_name == "max_connections") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_connections"); + configuration.rw_settings.max_connections = checkAndGetLiteralArgument(arg_value, "max_connections"); else throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Unknown key-value argument `{}` for StorageS3, expected: url, [access_key_id, secret_access_key], name of used format and [compression_method].", @@ -1220,7 +1224,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); return wrapReadBufferWithCompressionMethod( std::make_unique( - s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), + s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings, ctx->getReadSettings()), chooseCompressionMethod(key, compression_method), zstd_window_log_max); }; diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 23947a32092..76391cb2695 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -43,7 +43,8 @@ public: const Block & virtual_header, ContextPtr context, std::unordered_map * object_infos = nullptr, - Strings * read_keys_ = nullptr); + Strings * read_keys_ = nullptr, + const S3Settings::ReadWriteSettings & rw_settings_ = {}); String next(); @@ -79,7 +80,7 @@ public: std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - UInt64 max_single_read_retries_, + const S3Settings::ReadWriteSettings & rw_settings_, String compression_hint_, const std::shared_ptr & client_, const String & bucket, @@ -102,7 +103,7 @@ private: String format; ColumnsDescription columns_desc; UInt64 max_block_size; - UInt64 max_single_read_retries; + S3Settings::ReadWriteSettings rw_settings; String compression_hint; std::shared_ptr client; Block sample_block; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 65e9bb1ab8c..9e670c65e17 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -56,6 +57,14 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U rw_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); rw_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); + // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. 
+ if (UInt64 max_get_rps = get_uint_for_key(key, "max_get_rps", true, settings.s3_max_get_rps)) + rw_settings.get_request_throttler = std::make_shared( + max_get_rps, get_uint_for_key(key, "max_get_burst", true, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); + if (UInt64 max_put_rps = get_uint_for_key(key, "max_put_rps", true, settings.s3_max_put_rps)) + rw_settings.put_request_throttler = std::make_shared( + max_put_rps, get_uint_for_key(key, "max_put_burst", true, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); + s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(rw_settings)}); } } @@ -87,6 +96,12 @@ S3Settings::ReadWriteSettings::ReadWriteSettings(const Settings & settings) max_connections = settings.s3_max_connections; check_objects_after_upload = settings.s3_check_objects_after_upload; max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; + if (settings.s3_max_get_rps) + get_request_throttler = std::make_shared( + settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); + if (settings.s3_max_put_rps) + put_request_throttler = std::make_shared( + settings.s3_max_put_rps, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); } void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & settings) @@ -106,6 +121,12 @@ void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & s if (!max_unexpected_write_error_retries) max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; check_objects_after_upload = settings.s3_check_objects_after_upload; + if (!get_request_throttler && settings.s3_max_get_rps) + get_request_throttler = std::make_shared( + settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); + if (!put_request_throttler && settings.s3_max_put_rps) + put_request_throttler = std::make_shared( + settings.s3_max_put_rps, settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); } } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 2da4a1d7590..40ba11a19ba 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,8 @@ struct S3Settings size_t max_connections = 0; bool check_objects_after_upload = false; size_t max_unexpected_write_error_retries = 0; + ThrottlerPtr get_request_throttler; + ThrottlerPtr put_request_throttler; ReadWriteSettings() = default; explicit ReadWriteSettings(const Settings & settings); @@ -46,7 +49,9 @@ struct S3Settings && max_single_part_upload_size == other.max_single_part_upload_size && max_connections == other.max_connections && check_objects_after_upload == other.check_objects_after_upload - && max_unexpected_write_error_retries == other.max_unexpected_write_error_retries; + && max_unexpected_write_error_retries == other.max_unexpected_write_error_retries + && get_request_throttler == other.get_request_throttler + && put_request_throttler == other.put_request_throttler; } void updateFromSettingsIfEmpty(const Settings & settings); From 6d5d9ff421cbe7497c6c94f032eb2f9807fe2799 Mon Sep 17 00:00:00 2001 From: serxa Date: Tue, 8 Nov 2022 13:48:23 +0000 Subject: [PATCH 07/78] rename ReadWriteSettings -> RequestSettings --- src/Backups/BackupIO_S3.cpp | 14 ++--- src/Backups/BackupIO_S3.h | 4 +- src/Coordination/KeeperSnapshotManagerS3.cpp | 12 ++--- .../ObjectStorages/S3/S3ObjectStorage.cpp | 12 ++--- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 6 +-- src/Disks/ObjectStorages/S3/diskSettings.cpp | 26 ++++----- src/IO/ReadBufferFromS3.cpp | 10 ++-- src/IO/ReadBufferFromS3.h | 10 ++-- src/IO/S3/tests/gtest_aws_s3_client.cpp | 6 +-- src/IO/WriteBufferFromS3.cpp | 18 +++---- src/IO/WriteBufferFromS3.h | 4 +- .../ExternalDataSourceConfiguration.h | 2 +- src/Storages/StorageS3.cpp | 54 +++++++++---------- src/Storages/StorageS3.h | 12 ++--- src/Storages/StorageS3Cluster.cpp | 2 +- src/Storages/StorageS3Settings.cpp | 26 ++++----- src/Storages/StorageS3Settings.h | 12 ++--- 17 files changed, 115 insertions(+), 115 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 3f723c98a2a..f794500980b 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -87,7 +87,7 @@ BackupReaderS3::BackupReaderS3( : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , read_settings(context_->getReadSettings()) - , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings) + , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) { } @@ -115,7 +115,7 @@ UInt64 BackupReaderS3::getFileSize(const String & file_name) std::unique_ptr BackupReaderS3::readFile(const String & file_name) { return std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, rw_settings, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); } @@ -124,9 +124,9 @@ BackupWriterS3::BackupWriterS3( : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , read_settings(context_->getReadSettings()) - , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings) + , 
request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) { - rw_settings.updateFromSettingsIfEmpty(context_->getSettingsRef()); + request_settings.updateFromSettingsIfEmpty(context_->getSettingsRef()); } DataSourceDescription BackupWriterS3::getDataSourceDescription() const @@ -212,7 +212,7 @@ void BackupWriterS3::copyObjectMultipartImpl( std::vector part_tags; - size_t upload_part_size = rw_settings.min_upload_part_size; + size_t upload_part_size = request_settings.min_upload_part_size; for (size_t position = 0, part_number = 1; position < size; ++part_number, position += upload_part_size) { Aws::S3::Model::UploadPartCopyRequest part_request; @@ -317,7 +317,7 @@ bool BackupWriterS3::fileContentsEqual(const String & file_name, const String & try { auto in = std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, rw_settings, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); String actual_file_contents(expected_file_contents.size(), ' '); return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) && (actual_file_contents == expected_file_contents) && in->eof(); @@ -335,7 +335,7 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, - rw_settings, + request_settings, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "BackupWriterS3")); diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 17a365da396..99d7558ac9e 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -40,7 +40,7 @@ private: S3::URI s3_uri; std::shared_ptr client; ReadSettings read_settings; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; }; @@ -83,7 +83,7 @@ private: S3::URI s3_uri; std::shared_ptr client; ReadSettings read_settings; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; }; } diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 8777ef341ff..cc837f5f496 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -135,8 +135,8 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa if (s3_client == nullptr) return; - S3Settings::ReadWriteSettings read_write_settings; - read_write_settings.upload_part_size_multiply_parts_count_threshold = 10000; + S3Settings::RequestSettings request_settings_1; + request_settings_1.upload_part_size_multiply_parts_count_threshold = 10000; const auto create_writer = [&](const auto & key) { @@ -145,7 +145,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa s3_client->client, s3_client->uri.bucket, key, - read_write_settings + request_settings_1 }; }; @@ -194,15 +194,15 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa lock_writer.finalize(); // We read back the written UUID, if it's the same we can upload the file - S3Settings::ReadWriteSettings rw_settings; - rw_settings.max_single_read_retries = 1; + S3Settings::RequestSettings request_settings_2; + request_settings_2.max_single_read_retries = 1; ReadBufferFromS3 lock_reader { s3_client->client, s3_client->uri.bucket, lock_file, "", - rw_settings, + request_settings_2, {} }; diff --git 
a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 21b7ff4f0f1..bbb5c4cddc5 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -175,7 +175,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT bucket, path, version_id, - settings_ptr->s3_settings, + settings_ptr->request_settings, disk_read_settings, /* use_external_buffer */true, /* offset */0, @@ -212,7 +212,7 @@ std::unique_ptr S3ObjectStorage::readObject( /// NOLINT bucket, object.absolute_path, version_id, - settings_ptr->s3_settings, + settings_ptr->request_settings, patchSettings(read_settings)); } @@ -238,7 +238,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN client.get(), bucket, object.absolute_path, - settings_ptr->s3_settings, + settings_ptr->request_settings, attributes, buf_size, std::move(scheduler), @@ -473,7 +473,7 @@ void S3ObjectStorage::copyObjectImpl( throwIfError(outcome); auto settings_ptr = s3_settings.get(); - if (settings_ptr->s3_settings.check_objects_after_upload) + if (settings_ptr->request_settings.check_objects_after_upload) { auto object_head = requestObjectHeadData(dst_bucket, dst_key); if (!object_head.IsSuccess()) @@ -517,7 +517,7 @@ void S3ObjectStorage::copyObjectMultipartImpl( std::vector part_tags; - size_t upload_part_size = settings_ptr->s3_settings.min_upload_part_size; + size_t upload_part_size = settings_ptr->request_settings.min_upload_part_size; for (size_t position = 0, part_number = 1; position < size; ++part_number, position += upload_part_size) { ProfileEvents::increment(ProfileEvents::S3UploadPartCopy); @@ -570,7 +570,7 @@ void S3ObjectStorage::copyObjectMultipartImpl( throwIfError(outcome); } - if (settings_ptr->s3_settings.check_objects_after_upload) + if (settings_ptr->request_settings.check_objects_after_upload) { auto object_head = requestObjectHeadData(dst_bucket, dst_key); if (!object_head.IsSuccess()) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 6b1e8289b15..a737d3bc114 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -23,17 +23,17 @@ struct S3ObjectStorageSettings S3ObjectStorageSettings() = default; S3ObjectStorageSettings( - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, uint64_t min_bytes_for_seek_, int32_t list_object_keys_size_, int32_t objects_chunk_size_to_delete_) - : s3_settings(s3_settings_) + : request_settings(request_settings_) , min_bytes_for_seek(min_bytes_for_seek_) , list_object_keys_size(list_object_keys_size_) , objects_chunk_size_to_delete(objects_chunk_size_to_delete_) {} - S3Settings::ReadWriteSettings s3_settings; + S3Settings::RequestSettings request_settings; uint64_t min_bytes_for_seek; int32_t list_object_keys_size; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 707675c01fd..ca2e9d04926 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -34,25 +34,25 @@ namespace ErrorCodes std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { const Settings & settings = context->getSettingsRef(); - S3Settings::ReadWriteSettings rw_settings; - rw_settings.max_single_read_retries = config.getUInt64(config_prefix + 
".s3_max_single_read_retries", settings.s3_max_single_read_retries); - rw_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", settings.s3_min_upload_part_size); - rw_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", settings.s3_upload_part_size_multiply_factor); - rw_settings.upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", settings.s3_upload_part_size_multiply_parts_count_threshold); - rw_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", settings.s3_max_single_part_upload_size); - rw_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", settings.s3_check_objects_after_upload); - rw_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", settings.s3_max_unexpected_write_error_retries); + S3Settings::RequestSettings request_settings; + request_settings.max_single_read_retries = config.getUInt64(config_prefix + ".s3_max_single_read_retries", settings.s3_max_single_read_retries); + request_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", settings.s3_min_upload_part_size); + request_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", settings.s3_upload_part_size_multiply_factor); + request_settings.upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", settings.s3_upload_part_size_multiply_parts_count_threshold); + request_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", settings.s3_max_single_part_upload_size); + request_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", settings.s3_check_objects_after_upload); + request_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", settings.s3_max_unexpected_write_error_retries); // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. if (UInt64 max_get_rps = config.getUInt64(config_prefix + ".s3_max_get_rps", settings.s3_max_get_rps)) - rw_settings.get_request_throttler = std::make_shared( + request_settings.get_request_throttler = std::make_shared( max_get_rps, config.getUInt64(config_prefix + ".s3_max_get_burst", settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); if (UInt64 max_put_rps = config.getUInt64(config_prefix + ".s3_max_put_rps", settings.s3_max_put_rps)) - rw_settings.put_request_throttler = std::make_shared( + request_settings.put_request_throttler = std::make_shared( max_put_rps, config.getUInt64(config_prefix + ".s3_max_put_burst", settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); return std::make_unique( - rw_settings, + request_settings, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getInt(config_prefix + ".objects_chunk_size_to_delete", 1000)); @@ -134,8 +134,8 @@ std::unique_ptr getClient( static_cast(context->getGlobalContext()->getSettingsRef().s3_max_redirects), context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, /* for_disk_s3 = */ true, - settings.s3_settings.get_request_throttler, - settings.s3_settings.put_request_throttler); + settings.request_settings.get_request_throttler, + settings.request_settings.put_request_throttler); S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint"))); if (uri.key.back() != '/') diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index fa748469ca0..c14fbecf223 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -45,7 +45,7 @@ ReadBufferFromS3::ReadBufferFromS3( const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer_, size_t offset_, @@ -56,7 +56,7 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , version_id(version_id_) - , s3_settings(s3_settings_) + , request_settings(request_settings_) , offset(offset_) , read_until_position(read_until_position_) , read_settings(settings_) @@ -105,7 +105,7 @@ bool ReadBufferFromS3::nextImpl() } size_t sleep_time_with_backoff_milliseconds = 100; - for (size_t attempt = 0; attempt < s3_settings.max_single_read_retries && !next_result; ++attempt) + for (size_t attempt = 0; attempt < request_settings.max_single_read_retries && !next_result; ++attempt) { Stopwatch watch; try @@ -166,7 +166,7 @@ bool ReadBufferFromS3::nextImpl() attempt, e.message()); - if (attempt + 1 == s3_settings.max_single_read_retries) + if (attempt + 1 == request_settings.max_single_read_retries) throw; /// Pause before next attempt. 
@@ -349,7 +349,7 @@ SeekableReadBufferPtr ReadBufferS3Factory::getReader() bucket, key, version_id, - s3_settings, + request_settings, read_settings, false /*use_external_buffer*/, next_range->first, diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 3e1a26f7713..17b13bf7d62 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -34,7 +34,7 @@ private: String bucket; String key; String version_id; - const S3Settings::ReadWriteSettings s3_settings; + const S3Settings::RequestSettings request_settings; /// These variables are atomic because they can be used for `logging only` /// (where it is not important to get consistent result) @@ -53,7 +53,7 @@ public: const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer = false, size_t offset_ = 0, @@ -101,7 +101,7 @@ public: const String & version_id_, size_t range_step_, size_t object_size_, - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) @@ -111,7 +111,7 @@ public: , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) - , s3_settings(s3_settings_) + , request_settings(request_settings_) { assert(range_step > 0); assert(range_step < object_size); @@ -136,7 +136,7 @@ private: size_t range_step; size_t object_size; - const S3Settings::ReadWriteSettings s3_settings; + const S3Settings::RequestSettings request_settings; }; } diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 96a7c8b3dfb..2d298ca5df2 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -115,14 +115,14 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) ASSERT_TRUE(client); DB::ReadSettings read_settings; - DB::S3Settings::ReadWriteSettings rw_settings; - rw_settings.max_single_read_retries = max_single_read_retries; + DB::S3Settings::RequestSettings request_settings; + request_settings.max_single_read_retries = max_single_read_retries; DB::ReadBufferFromS3 read_buffer( client, uri.bucket, uri.key, version_id, - rw_settings, + request_settings, read_settings ); diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index f823015bd7d..d3a5b185248 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -71,7 +71,7 @@ WriteBufferFromS3::WriteBufferFromS3( std::shared_ptr client_ptr_, const String & bucket_, const String & key_, - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_, size_t buffer_size_, ThreadPoolCallbackRunner schedule_, @@ -79,10 +79,10 @@ WriteBufferFromS3::WriteBufferFromS3( : BufferWithOwnMemory(buffer_size_, nullptr, 0) , bucket(bucket_) , key(key_) - , s3_settings(s3_settings_) + , request_settings(request_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , upload_part_size(s3_settings_.min_upload_part_size) + , upload_part_size(request_settings_.min_upload_part_size) , schedule(std::move(schedule_)) , write_settings(write_settings_) { @@ -107,7 +107,7 @@ void WriteBufferFromS3::nextImpl() write_settings.remote_throttler->add(offset()); /// Data size exceeds singlepart upload threshold, 
need to use multipart upload. - if (multipart_upload_id.empty() && last_part_size > s3_settings.max_single_part_upload_size) + if (multipart_upload_id.empty() && last_part_size > request_settings.max_single_part_upload_size) createMultipartUpload(); if (!multipart_upload_id.empty() && last_part_size > upload_part_size) @@ -122,8 +122,8 @@ void WriteBufferFromS3::nextImpl() void WriteBufferFromS3::allocateBuffer() { - if (total_parts_uploaded != 0 && total_parts_uploaded % s3_settings.upload_part_size_multiply_parts_count_threshold == 0) - upload_part_size *= s3_settings.upload_part_size_multiply_factor; + if (total_parts_uploaded != 0 && total_parts_uploaded % request_settings.upload_part_size_multiply_parts_count_threshold == 0) + upload_part_size *= request_settings.upload_part_size_multiply_factor; temporary_buffer = Aws::MakeShared("temporary buffer"); temporary_buffer->exceptions(std::ios::badbit); @@ -177,7 +177,7 @@ void WriteBufferFromS3::finalizeImpl() if (!multipart_upload_id.empty()) completeMultipartUpload(); - if (s3_settings.check_objects_after_upload) + if (request_settings.check_objects_after_upload) { LOG_TRACE(log, "Checking object {} exists after upload", key); @@ -367,7 +367,7 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetMultipartUpload(multipart_upload); - size_t max_retry = std::max(s3_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -473,7 +473,7 @@ void WriteBufferFromS3::fillPutRequest(Aws::S3::Model::PutObjectRequest & req) void WriteBufferFromS3::processPutRequest(const PutObjectTask & task) { - size_t max_retry = std::max(s3_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3PutObject); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 28f831856d7..b4b5a6d37a3 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -50,7 +50,7 @@ public: std::shared_ptr client_ptr_, const String & bucket_, const String & key_, - const S3Settings::ReadWriteSettings & s3_settings_, + const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_ = std::nullopt, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, ThreadPoolCallbackRunner schedule_ = {}, @@ -88,7 +88,7 @@ private: const String bucket; const String key; - const S3Settings::ReadWriteSettings s3_settings; + const S3Settings::RequestSettings request_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index 5736336983a..a8c27e3d1d4 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -118,7 +118,7 @@ struct URLBasedDataSourceConfiguration struct StorageS3Configuration : URLBasedDataSourceConfiguration { S3::AuthSettings auth_settings; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; }; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9309e3f7384..cc13838a4c7 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -101,7 +101,7 @@ public: ContextPtr context_, std::unordered_map * 
object_infos_, Strings * read_keys_, - const S3Settings::ReadWriteSettings & rw_settings_) + const S3Settings::RequestSettings & request_settings_) : WithContext(context_) , client(client_) , globbed_uri(globbed_uri_) @@ -109,7 +109,7 @@ public: , virtual_header(virtual_header_) , object_infos(object_infos_) , read_keys(read_keys_) - , rw_settings(rw_settings_) + , request_settings(request_settings_) { if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception("Expression can not have wildcards inside bucket name", ErrorCodes::UNEXPECTED_EXPRESSION); @@ -260,7 +260,7 @@ private: bool is_finished{false}; std::unordered_map * object_infos; Strings * read_keys; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; }; StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( @@ -271,8 +271,8 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( ContextPtr context, std::unordered_map * object_infos_, Strings * read_keys_, - const S3Settings::ReadWriteSettings & rw_settings_) - : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_, rw_settings_)) + const S3Settings::RequestSettings & request_settings_) + : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_, request_settings_)) { } @@ -385,7 +385,7 @@ StorageS3Source::StorageS3Source( std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - const S3Settings::ReadWriteSettings & rw_settings_, + const S3Settings::RequestSettings & request_settings_, String compression_hint_, const std::shared_ptr & client_, const String & bucket_, @@ -401,7 +401,7 @@ StorageS3Source::StorageS3Source( , format(format_) , columns_desc(columns_) , max_block_size(max_block_size_) - , rw_settings(rw_settings_) + , request_settings(request_settings_) , compression_hint(std::move(compression_hint_)) , client(client_) , sample_block(sample_block_) @@ -467,7 +467,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (!use_parallel_download || object_too_small) { LOG_TRACE(log, "Downloading object of size {} from S3 in single thread", object_size); - return std::make_unique(client, bucket, key, version_id, rw_settings, getContext()->getReadSettings()); + return std::make_unique(client, bucket, key, version_id, request_settings, getContext()->getReadSettings()); } assert(object_size > 0); @@ -479,7 +479,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k } auto factory = std::make_unique( - client, bucket, key, version_id, download_buffer_size, object_size, rw_settings, getContext()->getReadSettings()); + client, bucket, key, version_id, download_buffer_size, object_size, request_settings, getContext()->getReadSettings()); LOG_TRACE( log, "Downloading from S3 in {} threads. 
Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); @@ -589,7 +589,7 @@ public: s3_configuration_.client, bucket, key, - s3_configuration_.rw_settings, + s3_configuration_.request_settings, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "S3ParallelRead"), @@ -753,7 +753,7 @@ StorageS3::StorageS3( bool distributed_processing_, ASTPtr partition_by_) : IStorage(table_id_) - , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} + , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} , keys({s3_configuration.uri.key}) , format_name(configuration_.format) , compression_method(configuration_.compression_method) @@ -819,7 +819,7 @@ std::shared_ptr StorageS3::createFileIterator( { /// Iterate through disclosed globs and make a source for each file auto glob_iterator = std::make_shared( - *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys, s3_configuration.rw_settings); + *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys, s3_configuration.request_settings); return std::make_shared([glob_iterator]() { return glob_iterator->next(); }); } else @@ -909,7 +909,7 @@ Pipe StorageS3::read( format_settings, columns_description, max_block_size, - s3_configuration.rw_settings, + s3_configuration.request_settings, compression_method, s3_configuration.client, s3_configuration.uri.bucket, @@ -1026,10 +1026,10 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration & upd) { auto settings = ctx->getStorageS3Settings().getSettings(upd.uri.uri.toString()); - if (upd.rw_settings != settings.rw_settings) - upd.rw_settings = settings.rw_settings; + if (upd.request_settings != settings.request_settings) + upd.request_settings = settings.request_settings; - upd.rw_settings.updateFromSettingsIfEmpty(ctx->getSettings()); + upd.request_settings.updateFromSettingsIfEmpty(ctx->getSettings()); if (upd.client) { @@ -1048,11 +1048,11 @@ void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration static_cast(ctx->getGlobalContext()->getSettingsRef().s3_max_redirects), ctx->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, /* for_disk_s3 = */ false, - upd.rw_settings.get_request_throttler, - upd.rw_settings.put_request_throttler); + upd.request_settings.get_request_throttler, + upd.request_settings.put_request_throttler); client_configuration.endpointOverride = upd.uri.endpoint; - client_configuration.maxConnections = static_cast(upd.rw_settings.max_connections); + client_configuration.maxConnections = static_cast(upd.request_settings.max_connections); auto credentials = Aws::Auth::AWSCredentials(upd.auth_settings.access_key_id, upd.auth_settings.secret_access_key); auto headers = upd.auth_settings.headers; @@ -1084,17 +1084,17 @@ void StorageS3::processNamedCollectionResult(StorageS3Configuration & configurat else if (arg_name == "use_environment_credentials") configuration.auth_settings.use_environment_credentials = checkAndGetLiteralArgument(arg_value, "use_environment_credentials"); else if (arg_name == "max_single_read_retries") - configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_read_retries"); + 
configuration.request_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_read_retries"); else if (arg_name == "min_upload_part_size") - configuration.rw_settings.min_upload_part_size = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); + configuration.request_settings.min_upload_part_size = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); else if (arg_name == "upload_part_size_multiply_factor") - configuration.rw_settings.upload_part_size_multiply_factor = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); + configuration.request_settings.upload_part_size_multiply_factor = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); else if (arg_name == "upload_part_size_multiply_parts_count_threshold") - configuration.rw_settings.upload_part_size_multiply_parts_count_threshold = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); + configuration.request_settings.upload_part_size_multiply_parts_count_threshold = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); else if (arg_name == "max_single_part_upload_size") - configuration.rw_settings.max_single_part_upload_size = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); + configuration.request_settings.max_single_part_upload_size = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); else if (arg_name == "max_connections") - configuration.rw_settings.max_connections = checkAndGetLiteralArgument(arg_value, "max_connections"); + configuration.request_settings.max_connections = checkAndGetLiteralArgument(arg_value, "max_connections"); else throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Unknown key-value argument `{}` for StorageS3, expected: url, [access_key_id, secret_access_key], name of used format and [compression_method].", @@ -1162,7 +1162,7 @@ ColumnsDescription StorageS3::getTableStructureFromData( S3Configuration s3_configuration{ configuration.url, configuration.auth_settings, - S3Settings::ReadWriteSettings(ctx->getSettingsRef()), + S3Settings::RequestSettings(ctx->getSettingsRef()), configuration.headers}; updateS3Configuration(ctx, s3_configuration); @@ -1224,7 +1224,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); return wrapReadBufferWithCompressionMethod( std::make_unique( - s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings, ctx->getReadSettings()), + s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.request_settings, ctx->getReadSettings()), chooseCompressionMethod(key, compression_method), zstd_window_log_max); }; diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 76391cb2695..8c15ede2b0b 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -44,7 +44,7 @@ public: ContextPtr context, std::unordered_map * object_infos = nullptr, Strings * read_keys_ = nullptr, - const S3Settings::ReadWriteSettings & rw_settings_ = {}); + const S3Settings::RequestSettings & request_settings_ = {}); String next(); @@ -80,7 +80,7 @@ public: std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - const S3Settings::ReadWriteSettings & rw_settings_, + const S3Settings::RequestSettings & request_settings_, String 
compression_hint_, const std::shared_ptr & client_, const String & bucket, @@ -103,7 +103,7 @@ private: String format; ColumnsDescription columns_desc; UInt64 max_block_size; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; String compression_hint; std::shared_ptr client; Block sample_block; @@ -187,7 +187,7 @@ public: std::shared_ptr client; S3::AuthSettings auth_settings; - S3Settings::ReadWriteSettings rw_settings; + S3Settings::RequestSettings request_settings; /// If s3 configuration was passed from ast, then it is static. /// If from config - it can be changed with config reload. @@ -199,11 +199,11 @@ public: S3Configuration( const String & url_, const S3::AuthSettings & auth_settings_, - const S3Settings::ReadWriteSettings & rw_settings_, + const S3Settings::RequestSettings & request_settings_, const HeaderCollection & headers_from_ast_) : uri(S3::URI(url_)) , auth_settings(auth_settings_) - , rw_settings(rw_settings_) + , request_settings(request_settings_) , static_configuration(!auth_settings_.access_key_id.empty()) , headers_from_ast(headers_from_ast_) {} }; diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index 3b8c8b1cb92..920116527ee 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -46,7 +46,7 @@ StorageS3Cluster::StorageS3Cluster( const ConstraintsDescription & constraints_, ContextPtr context_) : IStorage(table_id_) - , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} + , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} , filename(configuration_.url) , cluster_name(configuration_.cluster_name) , format_name(configuration_.format) diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 9e670c65e17..e24b892dbac 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -48,24 +48,24 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U auto auth_settings = S3::AuthSettings::loadFromConfig(config_elem + "." 
+ key, config); - S3Settings::ReadWriteSettings rw_settings; - rw_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries); - rw_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size); - rw_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, settings.s3_upload_part_size_multiply_factor); - rw_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold); - rw_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size); - rw_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); - rw_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); + S3Settings::RequestSettings request_settings; + request_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries); + request_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size); + request_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, settings.s3_upload_part_size_multiply_factor); + request_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold); + request_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size); + request_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); + request_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. if (UInt64 max_get_rps = get_uint_for_key(key, "max_get_rps", true, settings.s3_max_get_rps)) - rw_settings.get_request_throttler = std::make_shared( + request_settings.get_request_throttler = std::make_shared( max_get_rps, get_uint_for_key(key, "max_get_burst", true, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); if (UInt64 max_put_rps = get_uint_for_key(key, "max_put_rps", true, settings.s3_max_put_rps)) - rw_settings.put_request_throttler = std::make_shared( + request_settings.put_request_throttler = std::make_shared( max_put_rps, get_uint_for_key(key, "max_put_burst", true, settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); - s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(rw_settings)}); + s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); } } } @@ -86,7 +86,7 @@ S3Settings StorageS3Settings::getSettings(const String & endpoint) const return {}; } -S3Settings::ReadWriteSettings::ReadWriteSettings(const Settings & settings) +S3Settings::RequestSettings::RequestSettings(const Settings & settings) { max_single_read_retries = settings.s3_max_single_read_retries; min_upload_part_size = settings.s3_min_upload_part_size; @@ -104,7 +104,7 @@ S3Settings::ReadWriteSettings::ReadWriteSettings(const Settings & settings) settings.s3_max_put_rps, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); } -void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & settings) +void S3Settings::RequestSettings::updateFromSettingsIfEmpty(const Settings & settings) { if (!max_single_read_retries) max_single_read_retries = settings.s3_max_single_read_retries; diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 40ba11a19ba..509a690ef8c 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -24,7 +24,7 @@ struct Settings; struct S3Settings { - struct ReadWriteSettings + struct RequestSettings { size_t max_single_read_retries = 0; size_t min_upload_part_size = 0; @@ -37,10 +37,10 @@ struct S3Settings ThrottlerPtr get_request_throttler; ThrottlerPtr put_request_throttler; - ReadWriteSettings() = default; - explicit ReadWriteSettings(const Settings & settings); + RequestSettings() = default; + explicit RequestSettings(const Settings & settings); - inline bool operator==(const ReadWriteSettings & other) const + inline bool operator==(const RequestSettings & other) const { return max_single_read_retries == other.max_single_read_retries && min_upload_part_size == other.min_upload_part_size @@ -58,11 +58,11 @@ struct S3Settings }; S3::AuthSettings auth_settings; - ReadWriteSettings rw_settings; + RequestSettings request_settings; inline bool operator==(const S3Settings & other) const { - return auth_settings == other.auth_settings && rw_settings == other.rw_settings; + return auth_settings == other.auth_settings && request_settings == other.request_settings; } }; From 2de26daa5630e38076f96ba600cb1594edab32d7 Mon Sep 17 00:00:00 2001 From: serxa Date: Tue, 8 Nov 2022 14:31:29 +0000 Subject: [PATCH 08/78] fix build --- src/Backups/BackupIO_S3.cpp | 8 ++++---- src/Storages/StorageDelta.cpp | 6 ++++-- src/Storages/StorageHudi.cpp | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index e953b441c1b..5a4e804a778 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -247,10 +247,10 @@ void BackupWriterS3::copyObjectMultipartImpl( position = next_position; - if (part_number % rw_settings.upload_part_size_multiply_parts_count_threshold == 0) + if (part_number % request_settings.upload_part_size_multiply_parts_count_threshold == 0) { - upload_part_size *= rw_settings.upload_part_size_multiply_factor; - upload_part_size = std::min(upload_part_size, rw_settings.max_upload_part_size); + upload_part_size *= request_settings.upload_part_size_multiply_factor; + upload_part_size = std::min(upload_part_size, request_settings.max_upload_part_size); } } @@ -293,7 
+293,7 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_ auto file_path = fs::path(s3_uri.key) / file_name_to; auto head = requestObjectHeadData(source_bucket, objects[0].absolute_path).GetResult(); - if (static_cast(head.GetContentLength()) < rw_settings.max_single_operation_copy_size) + if (static_cast(head.GetContentLength()) < request_settings.max_single_operation_copy_size) { copyObjectImpl( source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head); diff --git a/src/Storages/StorageDelta.cpp b/src/Storages/StorageDelta.cpp index e8287a2fd61..c077b24c610 100644 --- a/src/Storages/StorageDelta.cpp +++ b/src/Storages/StorageDelta.cpp @@ -151,12 +151,14 @@ std::vector JsonMetadataGetter::getJsonLogFiles() std::shared_ptr JsonMetadataGetter::createS3ReadBuffer(const String & key, ContextPtr context) { /// TODO: add parallel downloads + S3Settings::RequestSettings request_settings; + request_settings.max_single_read_retries = 10; return std::make_shared( base_configuration.client, base_configuration.uri.bucket, key, base_configuration.uri.version_id, - /* max single read retries */10, + request_settings, context->getReadSettings()); } @@ -187,7 +189,7 @@ StorageDelta::StorageDelta( ContextPtr context_, std::optional format_settings_) : IStorage(table_id_) - , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} + , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} , log(&Poco::Logger::get("StorageDeltaLake (" + table_id_.table_name + ")")) , table_path(base_configuration.uri.key) { diff --git a/src/Storages/StorageHudi.cpp b/src/Storages/StorageHudi.cpp index 121856c4a57..4b20e4cbd22 100644 --- a/src/Storages/StorageHudi.cpp +++ b/src/Storages/StorageHudi.cpp @@ -37,7 +37,7 @@ StorageHudi::StorageHudi( ContextPtr context_, std::optional format_settings_) : IStorage(table_id_) - , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} + , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} , log(&Poco::Logger::get("StorageHudi (" + table_id_.table_name + ")")) , table_path(base_configuration.uri.key) { From 2ad8c5829fa1381214858cf20de98fcc33d96fac Mon Sep 17 00:00:00 2001 From: serxa Date: Wed, 9 Nov 2022 14:18:56 +0000 Subject: [PATCH 09/78] add test --- .../02477_s3_request_throttler.reference | 1 + .../0_stateless/02477_s3_request_throttler.sql | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 tests/queries/0_stateless/02477_s3_request_throttler.reference create mode 100644 tests/queries/0_stateless/02477_s3_request_throttler.sql diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.reference b/tests/queries/0_stateless/02477_s3_request_throttler.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02477_s3_request_throttler.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.sql b/tests/queries/0_stateless/02477_s3_request_throttler.sql new file mode 100644 index 00000000000..7311c8ac9bc --- /dev/null +++ b/tests/queries/0_stateless/02477_s3_request_throttler.sql @@ -0,0 +1,15 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: needs s3 + +-- Limit S3 PUT request per second rate +SET s3_max_put_rps = 2; +SET s3_max_put_burst = 1; + 
+CREATE TEMPORARY TABLE times (t DateTime); + +-- INSERT query requires 3 PUT requests (CreateMultipartUpload, UploadPart, CompleteMultipartUpload) and 1/rps = 0.5 second in between, the first query is not throttled due to burst +INSERT INTO times SELECT now(); +INSERT INTO TABLE FUNCTION s3('http://localhost:11111/test/request-throttler.csv', 'test', 'testtest', 'CSV', 'number UInt64') SELECT number FROM numbers(1000000) SETTINGS s3_max_single_part_upload_size = 10000, s3_truncate_on_insert = 1; +INSERT INTO times SELECT now(); + +SELECT max(t) - min(t) >= 1 FROM times; From 7448d1bb5d7c89a769c8906a0dd149a1e493c8ff Mon Sep 17 00:00:00 2001 From: serxa Date: Wed, 9 Nov 2022 14:58:41 +0000 Subject: [PATCH 10/78] add docs --- docs/en/engines/table-engines/integrations/s3.md | 5 +++++ docs/en/engines/table-engines/mergetree-family/mergetree.md | 4 ++++ src/Core/Settings.h | 4 ++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index db983ab9c68..484fd265c3d 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -127,6 +127,10 @@ The following settings can be set before query execution or placed into configur - `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. - `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. - `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. +- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`. +- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. @@ -142,6 +146,7 @@ The following settings can be specified in configuration file for given endpoint - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. - `max_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. Optional. +- `max_put_rps`, `max_put_burst`, `max_get_rps` and `max_get_burst` - Throttling settings (see description above) to use for specific endpoint instead of per query. Optional. 
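For illustration, the new per-endpoint throttling keys might be combined with the existing endpoint options roughly as in the sketch below. This is an assumption-laden example, not part of the patch: the endpoint section name, the URL, and all numeric limits are hypothetical placeholders, and it presumes the same per-endpoint `<s3>` layout that the settings listed above already use.

``` xml
<clickhouse>
    <s3>
        <my_endpoint> <!-- arbitrary name for this endpoint section -->
            <endpoint>https://s3.example.com/my-bucket/</endpoint>
            <max_single_read_retries>4</max_single_read_retries>
            <!-- throttle GET requests to 100 per second, allowing bursts of up to 200 -->
            <max_get_rps>100</max_get_rps>
            <max_get_burst>200</max_get_burst>
            <!-- throttle PUT requests to 50 per second; burst defaults to the rps value when omitted -->
            <max_put_rps>50</max_put_rps>
        </my_endpoint>
    </s3>
</clickhouse>
```

Limits declared this way would apply only to requests against that endpoint, while the query-level `s3_max_*` settings described earlier act as the defaults.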
**Example:** diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 486baac2310..4fe286a2098 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -940,6 +940,10 @@ Optional parameters: - `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks//cache/`. - `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. +- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`. +- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). +- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. S3 disk can be configured as `main` or `cold` storage: ``` xml diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e36ff1985fb..2f8aca6c628 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -90,9 +90,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ - M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate. Zero means unlimited.", 0) \ + M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ - M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate. Zero means unlimited.", 0) \ + M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. 
By default (0) equals to `s3_max_put_rps`", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ From 839de0cd092420356846c6a53468e603f4200594 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 10 Nov 2022 16:03:11 +0800 Subject: [PATCH 11/78] add function factorial --- .../sql-reference/functions/math-functions.md | 30 +++++++++++++++++++ .../00087_math_functions.reference | 3 ++ .../0_stateless/00087_math_functions.sql | 7 +++++ 3 files changed, 40 insertions(+) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 430762a1885..bfd2f63dc5b 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -549,3 +549,33 @@ Result: │ 3.141592653589793 │ └───────────────────┘ ``` + + +## factorial(n) + +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is Int64. + +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will causes an throw exception. + + +**Syntax** + +``` sql +factorial(n) +``` + +**Example** + +Query: + +``` sql +SELECT factorial(10); +``` + +Result: + +``` text +┌─factorial(10)─┐ +│ 3628800 │ +└───────────────┘ +``` diff --git a/tests/queries/0_stateless/00087_math_functions.reference b/tests/queries/0_stateless/00087_math_functions.reference index e02aac11faf..48cb24098ff 100644 --- a/tests/queries/0_stateless/00087_math_functions.reference +++ b/tests/queries/0_stateless/00087_math_functions.reference @@ -112,3 +112,6 @@ 1 1 1 +1 +1 +1 diff --git a/tests/queries/0_stateless/00087_math_functions.sql b/tests/queries/0_stateless/00087_math_functions.sql index e40acfb3481..11456944be3 100644 --- a/tests/queries/0_stateless/00087_math_functions.sql +++ b/tests/queries/0_stateless/00087_math_functions.sql @@ -137,3 +137,10 @@ select erf(10) = 1; select erfc(0) = 1; select erfc(-10) = 2; select erfc(28) = 0; + +select factorial(-1) = 1; +select factorial(0) = 1; +select factorial(10) = 3628800; +select factorial(100); -- { serverError 36 } +select factorial('100'); -- { serverError 43 } +select factorial(100.1234); -- { serverError 43 } From d76c271242b9f4a5ce025d6096857bb2d84adeff Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 10 Nov 2022 16:08:22 +0800 Subject: [PATCH 12/78] add missed file --- src/Functions/factorial.cpp | 103 ++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 src/Functions/factorial.cpp diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp new file mode 100644 index 00000000000..d92cb668214 --- /dev/null +++ b/src/Functions/factorial.cpp @@ -0,0 +1,103 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +template +struct FactorialImpl +{ + using ResultType = Int64; + static const constexpr bool allow_decimal = false; + static const constexpr bool allow_fixed_string = false; + static const constexpr bool allow_string_integer = false; + + static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) 
+ { + if constexpr (std::is_floating_point_v || is_over_big_int) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of argument of function factorial, should not be floating point or big int"); + + if constexpr (is_integer) + { + if (a > 20) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum value for the input argument of function factorial is 20"); + + if constexpr (is_unsigned_v) + return factorials[a]; + else if constexpr (is_signed_v) + return a >= 0 ? factorials[a] : 1; + } + } + +#if USE_EMBEDDED_COMPILER + static constexpr bool compilable = false; /// special type handling, some other time +#endif + +private: + static const constexpr ResultType factorials[21] + = {1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880, + 3628800, + 39916800, + 479001600, + 6227020800L, + 87178291200L, + 1307674368000L, + 20922789888000L, + 355687428096000L, + 6402373705728000L, + 121645100408832000L, + 2432902008176640000L}; +}; + +struct NameFactorial { static constexpr auto name = "factorial"; }; +using FunctionFactorial = FunctionUnaryArithmetic; + +template <> struct FunctionUnaryArithmeticMonotonicity +{ + static bool has() { return true; } + + static IFunction::Monotonicity get(const Field & /*left*/, const Field & /*right*/) + { + return { + .is_monotonic = true, + .is_positive = true, + .is_always_monotonic = true, + .is_strict = false, + }; + } +}; + + +REGISTER_FUNCTION(Factorial) +{ + factory.registerFunction( + { + R"( +Computes the factorial of an integer value. It works with any native integer type. The return type is Int64. + +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will causes an throw exception. +)", + Documentation::Examples{{"factorial", "SELECT factorial(10)"}}, + Documentation::Categories{"Mathematical"}}, + FunctionFactory::CaseInsensitive); +} + +} From a1fd2752094f9ccdd01f134e46b0cc0ff68b2c23 Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Thu, 10 Nov 2022 09:13:52 +0100 Subject: [PATCH 13/78] Fixed duplicated includes --- src/Functions/FunctionsJSON.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index a89a2b24b9e..b4b15a25047 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -21,11 +21,6 @@ #include #include -#include -#include -#include -#include -#include #include #include #include From c12c1cdf9b1dafbb117c008a6e7fb4ee8e789087 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 7 Nov 2022 21:30:41 +0100 Subject: [PATCH 14/78] Allow release only from ready commits --- tests/ci/release.py | 49 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/tests/ci/release.py b/tests/ci/release.py index fd4bda3eae4..8024091e300 100755 --- a/tests/ci/release.py +++ b/tests/ci/release.py @@ -1,5 +1,14 @@ #!/usr/bin/env python +""" +script to create releases for ClickHouse + +The `gh` CLI prefered over the PyGithub to have an easy way to rollback bad +release in command line by simple execution giving rollback commands + +On another hand, PyGithub is used for convenient getting commit's status from API +""" + from contextlib import contextmanager from typing import List, Optional @@ -8,6 +17,8 @@ import logging import subprocess from git_helper import commit, release_branch +from github_helper import GitHub +from mark_release_ready import RELEASE_READY_STATUS from version_helper import ( FILE_WITH_VERSION_PATH, GENERATED_CONTRIBUTORS, @@ -67,12 +78,12 @@ class Release: self._release_branch = "" self._rollback_stack = [] # type: List[str] - def run(self, cmd: str, cwd: Optional[str] = None) -> str: + def run(self, cmd: str, cwd: Optional[str] = None, **kwargs) -> str: cwd_text = "" if cwd: cwd_text = f" (CWD='{cwd}')" logging.info("Running command%s:\n %s", cwd_text, cmd) - return self._git.run(cmd, cwd) + return self._git.run(cmd, cwd, **kwargs) def set_release_branch(self): # Fetch release commit in case it does not exist locally @@ -94,6 +105,38 @@ class Release: return VersionType.LTS return VersionType.STABLE + def check_commit_release_ready(self): + # First, get the auth token from gh cli + auth_status = self.run( + "gh auth status -t", stderr=subprocess.STDOUT + ).splitlines() + token = "" + for line in auth_status: + if "✓ Token:" in line: + token = line.split()[-1] + if not token: + logging.error("Can not extract token from `gh auth`") + raise subprocess.SubprocessError("Can not extract token from `gh auth`") + gh = GitHub(token, per_page=100) + repo = gh.get_repo(str(self.repo)) + + # Statuses are ordered by descending updated_at, so the first necessary + # status in the list is the most recent + statuses = repo.get_commit(self.release_commit).get_statuses() + for status in statuses: + if status.context == RELEASE_READY_STATUS: + if status.state == "success": + return + + raise Exception( + f"the status {RELEASE_READY_STATUS} is {status.state}, not success" + ) + + raise Exception( + f"the status {RELEASE_READY_STATUS} " + f"is not found for commit {self.release_commit}" + ) + def check_prerequisites(self): """ Check tooling installed in the system, `git` is checked by Git() init @@ -108,6 +151,8 @@ class Release: ) raise + self.check_commit_release_ready() + def do(self, check_dirty: bool, check_branch: bool, with_release_branch: bool): self.check_prerequisites() From cbdb2b0be471cba9722bc22efcca04bcac922fd4 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 10 Nov 2022 12:19:06 +0000 Subject: [PATCH 15/78] add more asserts in test --- .../02477_s3_request_throttler.reference | 1 + .../0_stateless/02477_s3_request_throttler.sql | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.reference b/tests/queries/0_stateless/02477_s3_request_throttler.reference index d00491fd7e5..9315e86b328 100644 --- a/tests/queries/0_stateless/02477_s3_request_throttler.reference +++ 
b/tests/queries/0_stateless/02477_s3_request_throttler.reference @@ -1 +1,2 @@ 1 +1 1 1 diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.sql b/tests/queries/0_stateless/02477_s3_request_throttler.sql index 7311c8ac9bc..25653b1bab0 100644 --- a/tests/queries/0_stateless/02477_s3_request_throttler.sql +++ b/tests/queries/0_stateless/02477_s3_request_throttler.sql @@ -7,9 +7,20 @@ SET s3_max_put_burst = 1; CREATE TEMPORARY TABLE times (t DateTime); --- INSERT query requires 3 PUT requests (CreateMultipartUpload, UploadPart, CompleteMultipartUpload) and 1/rps = 0.5 second in between, the first query is not throttled due to burst +-- INSERT query requires 3 PUT requests and 1/rps = 0.5 second in between, the first query is not throttled due to burst INSERT INTO times SELECT now(); INSERT INTO TABLE FUNCTION s3('http://localhost:11111/test/request-throttler.csv', 'test', 'testtest', 'CSV', 'number UInt64') SELECT number FROM numbers(1000000) SETTINGS s3_max_single_part_upload_size = 10000, s3_truncate_on_insert = 1; INSERT INTO times SELECT now(); SELECT max(t) - min(t) >= 1 FROM times; + +SYSTEM FLUSH LOGS; +SELECT ProfileEvents['S3CreateMultipartUpload'] == 1, + ProfileEvents['S3UploadPart'] == 1, + ProfileEvents['S3CompleteMultipartUpload'] == 1 +FROM system.query_log +WHERE query LIKE '%request-throttler.csv%' +AND type = 'QueryFinish' +AND current_database = currentDatabase() +ORDER BY query_start_time DESC +LIMIT 1; From f58e960a7dbe71b54d69ec06d9d54967d0d97ea8 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 10 Nov 2022 21:07:44 +0800 Subject: [PATCH 16/78] update as requested --- .../sql-reference/functions/math-functions.md | 2 +- src/Functions/factorial.cpp | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index bfd2f63dc5b..7f349686d8b 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -555,7 +555,7 @@ Result: Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is Int64. -The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will causes an throw exception. +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will cause an exception to be thrown.
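(A quick illustration of the boundary described above, assuming a server that includes this patch; the expected value of 20! matches the lookup table added earlier in this series, and the error code follows the new test file.)

```sql
SELECT factorial(20); -- returns 2432902008176640000, the largest factorial that fits in Int64
SELECT factorial(21); -- { serverError 36 }
```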
**Syntax** diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp index d92cb668214..5c46b97c193 100644 --- a/src/Functions/factorial.cpp +++ b/src/Functions/factorial.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -74,13 +75,22 @@ template <> struct FunctionUnaryArithmeticMonotonicity { static bool has() { return true; } - static IFunction::Monotonicity get(const Field & /*left*/, const Field & /*right*/) + static IFunction::Monotonicity get(const Field & left, const Field & right) { + bool is_strict = false; + if (!left.isNull() && !right.isNull()) + { + auto left_value = applyVisitor(FieldVisitorConvertToNumber(), left); + auto right_value = applyVisitor(FieldVisitorConvertToNumber(), right); + if (1 <= left_value && left_value <= right_value && right_value <= 20) + is_strict = true; + } + return { .is_monotonic = true, .is_positive = true, .is_always_monotonic = true, - .is_strict = false, + .is_strict = is_strict, }; } }; @@ -91,9 +101,9 @@ REGISTER_FUNCTION(Factorial) factory.registerFunction( { R"( -Computes the factorial of an integer value. It works with any native integer type. The return type is Int64. +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is Int64. -The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will causes an throw exception. +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20; a value of 21 or greater overflows the range for Int64 and will cause an exception to be thrown.
)", Documentation::Examples{{"factorial", "SELECT factorial(10)"}}, Documentation::Categories{"Mathematical"}}, From 8a7c7bfb3df3084d01c621e34338b7363d029cc8 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 10 Nov 2022 21:12:55 +0800 Subject: [PATCH 17/78] extract test of function factorial --- tests/queries/0_stateless/00087_math_functions.reference | 3 --- tests/queries/0_stateless/00087_math_functions.sql | 7 ------- tests/queries/0_stateless/02478_factorial.reference | 3 +++ tests/queries/0_stateless/02478_factorial.sql | 7 +++++++ 4 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 tests/queries/0_stateless/02478_factorial.reference create mode 100644 tests/queries/0_stateless/02478_factorial.sql diff --git a/tests/queries/0_stateless/00087_math_functions.reference b/tests/queries/0_stateless/00087_math_functions.reference index 48cb24098ff..e02aac11faf 100644 --- a/tests/queries/0_stateless/00087_math_functions.reference +++ b/tests/queries/0_stateless/00087_math_functions.reference @@ -112,6 +112,3 @@ 1 1 1 -1 -1 -1 diff --git a/tests/queries/0_stateless/00087_math_functions.sql b/tests/queries/0_stateless/00087_math_functions.sql index 11456944be3..e40acfb3481 100644 --- a/tests/queries/0_stateless/00087_math_functions.sql +++ b/tests/queries/0_stateless/00087_math_functions.sql @@ -137,10 +137,3 @@ select erf(10) = 1; select erfc(0) = 1; select erfc(-10) = 2; select erfc(28) = 0; - -select factorial(-1) = 1; -select factorial(0) = 1; -select factorial(10) = 3628800; -select factorial(100); -- { serverError 36 } -select factorial('100'); -- { serverError 43 } -select factorial(100.1234); -- { serverError 43 } diff --git a/tests/queries/0_stateless/02478_factorial.reference b/tests/queries/0_stateless/02478_factorial.reference new file mode 100644 index 00000000000..e8183f05f5d --- /dev/null +++ b/tests/queries/0_stateless/02478_factorial.reference @@ -0,0 +1,3 @@ +1 +1 +1 diff --git a/tests/queries/0_stateless/02478_factorial.sql b/tests/queries/0_stateless/02478_factorial.sql new file mode 100644 index 00000000000..e1a0f7d60e5 --- /dev/null +++ b/tests/queries/0_stateless/02478_factorial.sql @@ -0,0 +1,7 @@ +select factorial(-1) = 1; +select factorial(0) = 1; +select factorial(10) = 3628800; + +select factorial(100); -- { serverError 36 } +select factorial('100'); -- { serverError 43 } +select factorial(100.1234); -- { serverError 43 } From 5d53ec9da88b0095fe693f441b85186ec792bbf2 Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Fri, 11 Nov 2022 11:07:05 +0100 Subject: [PATCH 18/78] Cleaned and replace some Strings by using chars directly --- src/Functions/FunctionsJSON.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index b4b15a25047..f8d5f357549 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -893,9 +893,17 @@ struct JSONExtractTree // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation. 
// In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution) // the data is padded here and written directly to the Low Cardinality Column - auto padded_str = str.data() + std::string(fixed_length - std::min(fixed_length, str.length()), '\0'); + if (str.size() == fixed_length) + { + assert_cast(dest).insertData(str.data(), str.size()); + } + else + { + String padded_str(str); + padded_str.resize(fixed_length, '\0'); - assert_cast(dest).insertData(padded_str.data(), padded_str.size()); + assert_cast(dest).insertData(padded_str.data(), padded_str.size()); + } return true; } @@ -1230,8 +1238,7 @@ struct JSONExtractTree auto fixed_length = typeid_cast(dictionary_type.get())->getN(); return std::make_unique(fixed_length); } - auto impl = build(function_name, dictionary_type); - return impl; + return build(function_name, dictionary_type); } case TypeIndex::Decimal256: return std::make_unique>(type); case TypeIndex::Decimal128: return std::make_unique>(type); @@ -1387,12 +1394,9 @@ public: { ColumnString::Chars chars; WriteBufferFromVector buf(chars, AppendModeTag()); - chars.push_back(0); traverse(element, buf); buf.finalize(); - std::string str = reinterpret_cast(chars.data()); - chars.push_back(0); - assert_cast(dest).insertData(str.data(), str.size()); + assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); } else { @@ -1423,7 +1427,6 @@ public: chars.push_back(0); std::string str = reinterpret_cast(chars.data()); - auto padded_str = str + std::string(col_str.getN() - std::min(col_str.getN(), str.length()), '\0'); col_str.insertData(str.data(), str.size()); @@ -1441,10 +1444,11 @@ public: traverse(element, buf); buf.finalize(); chars.push_back(0); - std::string str = reinterpret_cast(chars.data()); - auto padded_str = str + std::string(fixed_length - std::min(fixed_length, str.length()), '\0'); - assert_cast(dest).insertData(padded_str.data(), padded_str.size()); + for (unsigned long i = 0; i < fixed_length - chars.size(); ++i) + chars.push_back(0); + + assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); return true; } From f24991fa56f93e1ff55e37069e0aa58ab9380303 Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Fri, 11 Nov 2022 13:02:30 +0100 Subject: [PATCH 19/78] Remove pushing a last 0 in FixedStrings --- src/Functions/FunctionsJSON.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index f8d5f357549..0973c1f3e39 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1424,7 +1424,6 @@ public: if (chars.size() > col_str.getN()) return false; - chars.push_back(0); std::string str = reinterpret_cast(chars.data()); col_str.insertData(str.data(), str.size()); @@ -1443,7 +1442,6 @@ public: WriteBufferFromVector buf(chars, AppendModeTag()); traverse(element, buf); buf.finalize(); - chars.push_back(0); for (unsigned long i = 0; i < fixed_length - chars.size(); ++i) chars.push_back(0); From ad377b357f0dbd3a9b9671d5828b6d6e34963057 Mon Sep 17 00:00:00 2001 From: serxa Date: Fri, 11 Nov 2022 13:24:43 +0000 Subject: [PATCH 20/78] fix backup tests --- src/Backups/BackupIO_S3.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 5a4e804a778..8342749e230 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -89,6 +89,7 @@ BackupReaderS3::BackupReaderS3( , read_settings(context_->getReadSettings()) , 
request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) { + request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } DataSourceDescription BackupReaderS3::getDataSourceDescription() const @@ -128,6 +129,7 @@ BackupWriterS3::BackupWriterS3( , log(&Poco::Logger::get("BackupWriterS3")) { request_settings.updateFromSettingsIfEmpty(context_->getSettingsRef()); + request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } DataSourceDescription BackupWriterS3::getDataSourceDescription() const From b12ebab65c5e1b084a61900b61094ab0b515a886 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 11 Nov 2022 18:07:15 +0000 Subject: [PATCH 21/78] Enable keeper fault injection and retries for insert queries in funcntional tests --- docker/test/stress/run.sh | 3 +++ tests/config/users.d/insert_keeper_retries.xml | 8 ++++++++ 2 files changed, 11 insertions(+) create mode 100644 tests/config/users.d/insert_keeper_retries.xml diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 78f627bf45e..36b1cd1ef0d 100644 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -388,6 +388,9 @@ else rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||: rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||: + # it uses recently introduced settings which previous versions may not have + rm -f /etc/clickhouse-server/users.d/insert_keeper_retries.xml ||: + start clickhouse-client --query="SELECT 'Server version: ', version()" diff --git a/tests/config/users.d/insert_keeper_retries.xml b/tests/config/users.d/insert_keeper_retries.xml new file mode 100644 index 00000000000..462c9df5248 --- /dev/null +++ b/tests/config/users.d/insert_keeper_retries.xml @@ -0,0 +1,8 @@ + + + + 20 + 0.01 + + + From 986e9a60daa24ac6682e92ccf0247969d1dc1029 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 10 Nov 2022 13:03:54 +0100 Subject: [PATCH 22/78] Analyzer aggregation totals crash fix --- src/Planner/Planner.cpp | 22 +++++++++---------- ...regation_totals_rollup_crash_fix.reference | 8 +++++++ ...er_aggregation_totals_rollup_crash_fix.sql | 5 +++++ 3 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.reference create mode 100644 tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.sql diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index 56cc73456ce..28be1a83088 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -498,17 +498,6 @@ void Planner::buildQueryPlanIfNeeded() should_produce_results_in_order_of_bucket_number); query_plan.addStep(std::move(aggregating_step)); - if (query_node.isGroupByWithRollup()) - { - auto rollup_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(aggregator_params), true /*final*/, settings.group_by_use_nulls); - query_plan.addStep(std::move(rollup_step)); - } - else if (query_node.isGroupByWithCube()) - { - auto cube_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(aggregator_params), true /*final*/, settings.group_by_use_nulls); - query_plan.addStep(std::move(cube_step)); - } - if (query_node.isGroupByWithTotals()) { const auto & having_analysis_result = expression_analysis_result.getHaving(); @@ -528,6 +517,17 @@ void Planner::buildQueryPlanIfNeeded() 
query_plan.addStep(std::move(totals_having_step)); } + + if (query_node.isGroupByWithRollup()) + { + auto rollup_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(aggregator_params), true /*final*/, settings.group_by_use_nulls); + query_plan.addStep(std::move(rollup_step)); + } + else if (query_node.isGroupByWithCube()) + { + auto cube_step = std::make_unique(query_plan.getCurrentDataStream(), std::move(aggregator_params), true /*final*/, settings.group_by_use_nulls); + query_plan.addStep(std::move(cube_step)); + } } if (!having_executed && expression_analysis_result.hasHaving()) diff --git a/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.reference b/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.reference new file mode 100644 index 00000000000..7c5d87e1389 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.reference @@ -0,0 +1,8 @@ +0 +0 + +0 +((0.0001)) 0 +((0.0001)) 0 + +((0.0001)) 0 diff --git a/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.sql b/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.sql new file mode 100644 index 00000000000..6cd3e6a9385 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_aggregation_totals_rollup_crash_fix.sql @@ -0,0 +1,5 @@ +SET allow_experimental_analyzer = 1; + +SELECT anyLast(number) FROM numbers(1) GROUP BY number WITH ROLLUP WITH TOTALS; + +SELECT tuple(tuple(0.0001)), anyLast(number) FROM numbers(1) GROUP BY number WITH ROLLUP WITH TOTALS; From 769cef94588c8e0e97a807ae1ede657561fdfdfe Mon Sep 17 00:00:00 2001 From: Alfonso Martinez Date: Mon, 14 Nov 2022 11:51:50 +0100 Subject: [PATCH 23/78] Replaced str with chars directly --- src/Functions/FunctionsJSON.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 0973c1f3e39..2234c582ba6 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1424,9 +1424,8 @@ public: if (chars.size() > col_str.getN()) return false; - std::string str = reinterpret_cast(chars.data()); - - col_str.insertData(str.data(), str.size()); + chars.resize_fill(col_str.getN()); + col_str.insertData(reinterpret_cast(chars.data()), chars.size()); return true; @@ -1443,9 +1442,9 @@ public: traverse(element, buf); buf.finalize(); - for (unsigned long i = 0; i < fixed_length - chars.size(); ++i) - chars.push_back(0); - + if (chars.size() > fixed_length) + return false; + chars.resize_fill(fixed_length); assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); return true; From e18c97faf7e85e31003cb8d009e1aa49fd52eb48 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 5 Nov 2022 14:09:43 +0100 Subject: [PATCH 24/78] Remove dependencies maskSensitiveInfo() from Context. 
--- src/Backups/BackupInfo.cpp | 4 +- src/Backups/BackupInfo.h | 2 +- src/Backups/BackupsWorker.cpp | 4 +- src/Backups/registerBackupEngineS3.cpp | 2 +- .../registerBackupEnginesFileAndDisk.cpp | 2 +- src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/executeQuery.cpp | 4 +- .../maskSensitiveInfoInQueryForLogging.cpp | 161 +++++++----------- .../maskSensitiveInfoInQueryForLogging.h | 6 +- 9 files changed, 73 insertions(+), 116 deletions(-) diff --git a/src/Backups/BackupInfo.cpp b/src/Backups/BackupInfo.cpp index 12bf1f91e86..a10bfb4e82a 100644 --- a/src/Backups/BackupInfo.cpp +++ b/src/Backups/BackupInfo.cpp @@ -93,10 +93,10 @@ BackupInfo BackupInfo::fromAST(const IAST & ast) } -String BackupInfo::toStringForLogging(const ContextPtr & context) const +String BackupInfo::toStringForLogging() const { ASTPtr ast = toAST(); - return maskSensitiveInfoInBackupNameForLogging(serializeAST(*ast), ast, context); + return maskSensitiveInfoInBackupNameForLogging(serializeAST(*ast), ast); } } diff --git a/src/Backups/BackupInfo.h b/src/Backups/BackupInfo.h index 708238e3868..cf5d75bb0d0 100644 --- a/src/Backups/BackupInfo.h +++ b/src/Backups/BackupInfo.h @@ -22,7 +22,7 @@ struct BackupInfo ASTPtr toAST() const; static BackupInfo fromAST(const IAST & ast); - String toStringForLogging(const ContextPtr & context) const; + String toStringForLogging() const; }; } diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index e235a34a09a..5be2dde9a3d 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -166,7 +166,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context } auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - String backup_name_for_logging = backup_info.toStringForLogging(context); + String backup_name_for_logging = backup_info.toStringForLogging(); try { addInfo(backup_id, backup_name_for_logging, backup_settings.internal, BackupStatus::CREATING_BACKUP); @@ -388,7 +388,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt try { auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - String backup_name_for_logging = backup_info.toStringForLogging(context); + String backup_name_for_logging = backup_info.toStringForLogging(); addInfo(restore_id, backup_name_for_logging, restore_settings.internal, BackupStatus::RESTORING); /// Prepare context to use. 
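(An aside that may help while reading this refactoring: the code being moved is what rewrites credentials before queries and backup names reach the logs. A rough way to observe the effect, assuming the standard system.query_log table; the '[HIDDEN]' placeholder comes from the visitor shown later in this patch.)

```sql
-- After running a query that embeds a password, e.g.
--   SELECT * FROM remote('127.0.0.1', db.table, 'user', 'secret');
-- the logged text should contain '[HIDDEN]' in place of the password:
SYSTEM FLUSH LOGS;
SELECT query FROM system.query_log
WHERE query LIKE '%remote(%' AND query LIKE '%[HIDDEN]%'
ORDER BY event_time DESC
LIMIT 1;
```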
diff --git a/src/Backups/registerBackupEngineS3.cpp b/src/Backups/registerBackupEngineS3.cpp index 7f5cb7f12a1..afd4bd2b21d 100644 --- a/src/Backups/registerBackupEngineS3.cpp +++ b/src/Backups/registerBackupEngineS3.cpp @@ -47,7 +47,7 @@ void registerBackupEngineS3(BackupFactory & factory) auto creator_fn = []([[maybe_unused]] const BackupFactory::CreateParams & params) -> std::unique_ptr { #if USE_AWS_S3 - String backup_name_for_logging = params.backup_info.toStringForLogging(params.context); + String backup_name_for_logging = params.backup_info.toStringForLogging(); const String & id_arg = params.backup_info.id_arg; const auto & args = params.backup_info.args; diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index 6a3e1669507..def9c5ba188 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -99,7 +99,7 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) { auto creator_fn = [](const BackupFactory::CreateParams & params) -> std::unique_ptr { - String backup_name_for_logging = params.backup_info.toStringForLogging(params.context); + String backup_name_for_logging = params.backup_info.toStringForLogging(); const String & engine_name = params.backup_info.backup_engine_name; if (!params.backup_info.id_arg.empty()) diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 980e8f6e7b6..9bcc086d6c7 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -170,11 +170,11 @@ void DDLTaskBase::parseQueryFromEntry(ContextPtr context) query = parseQuery(parser_query, begin, end, description, 0, settings.max_parser_depth); } -void DDLTaskBase::formatRewrittenQuery(ContextPtr context) +void DDLTaskBase::formatRewrittenQuery(ContextPtr) { /// Convert rewritten AST back to string. query_str = queryToString(*query); - query_for_logging = maskSensitiveInfoInQueryForLogging(query_str, query, context); + query_for_logging = maskSensitiveInfoInQueryForLogging(query_str, query); } ContextMutablePtr DDLTaskBase::makeQueryContext(ContextPtr from_context, const ZooKeeperPtr & /*zookeeper*/) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 017485cf76c..a6f3a38303c 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -392,14 +392,14 @@ static std::tuple executeQueryImpl( /// MUST go before any modification (except for prepared statements, /// since it substitute parameters and without them query does not contain /// parameters), to keep query as-is in query_log and server log. - query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast, context); + query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast); } catch (...) { /// Anyway log the query. 
if (query.empty()) query.assign(begin, std::min(end - begin, static_cast(max_query_size))); - query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast, context); + query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast); logQuery(query_for_logging, context, internal, stage); diff --git a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp b/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp index fe05283eef5..6b2c865d166 100644 --- a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp +++ b/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp @@ -1,12 +1,11 @@ #include #include -#include #include -#include #include #include #include +#include #include #include #include @@ -45,7 +44,6 @@ namespace bool is_create_table_query = false; bool is_create_database_query = false; bool is_create_dictionary_query = false; - ContextPtr context; PasswordWipingMode mode = PasswordWipingMode::Query; }; @@ -272,8 +270,13 @@ namespace /// We need to distinguish that from s3('url', 'format', 'structure' [, 'compression_method']). /// So we will check whether the argument after 'url' is a format. String format; - if (!tryGetEvaluatedConstStringFromArgument(function, data, url_arg_idx + 1, &format)) + if (!tryGetStringFromArgument(function, url_arg_idx + 1, &format, /* allow_literal= */ true, /* allow_identifier= */ false)) + { + /// We couldn't evaluate the argument after 'url' so we don't know whether it is a format or `aws_access_key_id`. + /// So it's safer to wipe the next argument just in case. + wipePasswordFromArgument(function, data, url_arg_idx + 2); /// Wipe either `aws_secret_access_key` or `structure`. return; + } if (FormatFactory::instance().getAllFormats().contains(format)) return; /// The argument after 'url' is a format: s3('url', 'format', ...) @@ -309,27 +312,36 @@ namespace else { String database; - if (!tryGetEvaluatedConstDatabaseNameFromArgument(function, data, arg_num, &database)) + if (!tryGetStringFromArgument(function, arg_num, &database, /* allow_literal= */ true, /* allow_identifier= */ true)) + { + /// We couldn't evaluate the argument so we don't know whether it is 'db.table' or just 'db'. + /// Hence we can't figure out whether we should skip one argument 'user' or two arguments 'table', 'user' + /// before the argument 'password'. So it's safer to wipe two arguments just in case. + /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string + /// before wiping it (because the `password` argument is always a literal string). + if (tryGetStringFromArgument(function, arg_num + 2, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + wipePasswordFromArgument(function, data, arg_num + 2); /// Wipe either `password` or `user`. + if (tryGetStringFromArgument(function, arg_num + 3, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + wipePasswordFromArgument(function, data, arg_num + 3); /// Wipe either `password` or `sharding_key`. return; - ++arg_num; + } + ++arg_num; auto qualified_name = QualifiedTableName::parseFromString(database); if (qualified_name.database.empty()) ++arg_num; /// skip 'table' argument } - /// Check if username and password are specified - /// (sharding_key can be of any type so while we're getting string literals they're username & password). 
- String username, password; - bool username_specified = tryGetStringFromArgument(function, arg_num, &username); - bool password_specified = username_specified && tryGetStringFromArgument(function, arg_num + 1, &password); + /// Skip username. + ++arg_num; - if (password_specified) - { - /// Password is specified so we do our replacement: - /// remote('addresses_expr', db.table, 'user', 'password', ...) -> remote('addresses_expr', db.table, 'user', '[HIDDEN]', ...) - wipePasswordFromArgument(function, data, arg_num + 1); - } + /// Do our replacement: + /// remote('addresses_expr', db.table, 'user', 'password', ...) -> remote('addresses_expr', db.table, 'user', '[HIDDEN]', ...) + /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string + /// before wiping it (because the `password` argument is always a literal string). + bool can_be_password = tryGetStringFromArgument(function, arg_num, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false); + if (can_be_password) + wipePasswordFromArgument(function, data, arg_num); } static void wipePasswordFromEncryptionFunctionArguments(ASTFunction & function, Data & data) @@ -410,7 +422,7 @@ namespace data.password_was_hidden = true; } - static bool tryGetNumArguments(const ASTFunction & function, size_t * num_arguments) + static bool tryGetNumArguments(const ASTFunction & function, size_t * res) { if (!function.arguments) return false; @@ -420,11 +432,13 @@ namespace return false; /// return false because we don't want to validate query here const auto & arguments = expr_list->children; - *num_arguments = arguments.size(); + if (res) + *res = arguments.size(); return true; } - static bool tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * value) + static bool + tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * res, bool allow_literal, bool allow_identifier) { if (!function.arguments) return false; @@ -437,86 +451,30 @@ namespace if (arg_idx >= arguments.size()) return false; - const auto * literal = arguments[arg_idx]->as(); - if (!literal || literal->value.getType() != Field::Types::String) - return false; - - *value = literal->value.safeGet(); - return true; - } - - static bool tryGetEvaluatedConstStringFromArgument(const ASTFunction & function, Data & data, size_t arg_idx, String * value) - { - if (!function.arguments) - return false; - - const auto * expr_list = function.arguments->as(); - if (!expr_list) - return false; /// return false because we don't want to validate query here - - const auto & arguments = expr_list->children; - if (arg_idx >= arguments.size()) - return false; - - if constexpr (check_only) - { - data.can_contain_password = true; - return false; - } - ASTPtr argument = arguments[arg_idx]; - try + if (allow_literal) { - argument = evaluateConstantExpressionOrIdentifierAsLiteral(argument, data.context); - } - catch (...) 
- { - return false; + if (const auto * literal = argument->as()) + { + if (literal->value.getType() != Field::Types::String) + return false; + if (res) + *res = literal->value.safeGet(); + return true; + } } - const auto & literal = assert_cast(*argument); - if (literal.value.getType() != Field::Types::String) - return false; - - *value = literal.value.safeGet(); - return true; - } - - static bool tryGetEvaluatedConstDatabaseNameFromArgument(const ASTFunction & function, Data & data, size_t arg_idx, String * value) - { - if (!function.arguments) - return false; - - const auto * expr_list = function.arguments->as(); - if (!expr_list) - return false; /// return false because we don't want to validate query here - - const auto & arguments = expr_list->children; - if (arg_idx >= arguments.size()) - return false; - - if constexpr (check_only) + if (allow_identifier) { - data.can_contain_password = true; - return false; + if (const auto * id = argument->as()) + { + if (res) + *res = id->name(); + return true; + } } - ASTPtr argument = arguments[arg_idx]; - try - { - argument = evaluateConstantExpressionForDatabaseName(argument, data.context); - } - catch (...) - { - return false; - } - - const auto & literal = assert_cast(*argument); - if (literal.value.getType() != Field::Types::String) - return false; - - *value = literal.value.safeGet(); - return true; + return false; } static void visitDictionaryDef(ASTDictionary & dictionary, Data & data) @@ -567,11 +525,10 @@ namespace /// Removes a password or its hash from a query if it's specified there or replaces it with some placeholder. /// This function is used to prepare a query for storing in logs (we don't want logs to contain sensitive information). - bool wipePasswordFromQuery(ASTPtr ast, PasswordWipingMode mode, const ContextPtr & context) + bool wipePasswordFromQuery(ASTPtr ast, PasswordWipingMode mode) { using WipingVisitor = PasswordWipingVisitor; WipingVisitor::Data data; - data.context = context; data.mode = mode; WipingVisitor::Visitor visitor{data}; visitor.visit(ast); @@ -579,7 +536,7 @@ namespace } /// Common utility for masking sensitive information. 
- String maskSensitiveInfoImpl(const String & query, const ASTPtr & parsed_query, PasswordWipingMode mode, const ContextPtr & context) + String maskSensitiveInfoImpl(const String & query, const ASTPtr & parsed_query, PasswordWipingMode mode) { String res = query; @@ -587,7 +544,7 @@ namespace if (parsed_query && canContainPassword(*parsed_query, mode)) { ASTPtr ast_without_password = parsed_query->clone(); - if (wipePasswordFromQuery(ast_without_password, mode, context)) + if (wipePasswordFromQuery(ast_without_password, mode)) res = serializeAST(*ast_without_password); } @@ -602,22 +559,22 @@ namespace } } - res = res.substr(0, context->getSettingsRef().log_queries_cut_to_length); + //res = res.substr(0, context->getSettingsRef().log_queries_cut_to_length); return res; } } -String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query, const ContextPtr & context) +String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query) { - return maskSensitiveInfoImpl(query, parsed_query, PasswordWipingMode::Query, context); + return maskSensitiveInfoImpl(query, parsed_query, PasswordWipingMode::Query); } -String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast, const ContextPtr & context) +String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast) { - return maskSensitiveInfoImpl(backup_name, ast, PasswordWipingMode::BackupName, context); + return maskSensitiveInfoImpl(backup_name, ast, PasswordWipingMode::BackupName); } } diff --git a/src/Interpreters/maskSensitiveInfoInQueryForLogging.h b/src/Interpreters/maskSensitiveInfoInQueryForLogging.h index 3892f89bc52..6284b9a9294 100644 --- a/src/Interpreters/maskSensitiveInfoInQueryForLogging.h +++ b/src/Interpreters/maskSensitiveInfoInQueryForLogging.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB @@ -10,10 +10,10 @@ namespace DB /// Makes a version of a query without sensitive information (e.g. passwords) for logging. /// The parameter `parsed query` is allowed to be nullptr if the query cannot be parsed. /// Does not validate AST, works a best-effort way. -String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query, const ContextPtr & context); +String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query); /// Makes a version of backup name without sensitive information (e.g. passwords) for logging. /// Does not validate AST, works a best-effort way. 
-String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast, const ContextPtr & context); +String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast); } From 8e99f5fea3e91c862437fee7ff616516f7e5b687 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 5 Nov 2022 15:53:08 +0100 Subject: [PATCH 25/78] Move maskSensitiveInfoInQueryForLogging() to src/Parsers/ --- src/Backups/BackupInfo.cpp | 2 +- src/Backups/DDLAdjustingForBackupVisitor.h | 2 +- src/Common/KnownObjectNames.cpp | 48 +++++++++++++++++++ src/Common/KnownObjectNames.h | 37 ++++++++++++++ src/Databases/DDLDependencyVisitor.h | 2 +- src/Databases/DDLRenamingVisitor.cpp | 2 +- src/Databases/DDLRenamingVisitor.h | 2 +- src/Formats/FormatFactory.cpp | 3 ++ .../UserDefinedSQLFunctionVisitor.h | 2 +- src/Interpreters/ActionsVisitor.h | 2 +- .../AggregateFunctionOfGroupByKeysVisitor.h | 2 +- .../ArithmeticOperationsInAgrFuncOptimize.h | 2 +- src/Interpreters/ArrayJoinedColumnsVisitor.h | 2 +- src/Interpreters/CollectJoinOnKeysVisitor.h | 2 +- src/Interpreters/ColumnAliasesVisitor.h | 2 +- .../ConvertFunctionOrLikeVisitor.h | 2 +- .../ConvertStringsToEnumVisitor.h | 2 +- src/Interpreters/CrossToInnerJoinVisitor.h | 2 +- src/Interpreters/DDLTask.cpp | 2 +- src/Interpreters/DuplicateOrderByVisitor.h | 2 +- .../ExecuteScalarSubqueriesVisitor.h | 2 +- .../ExtractExpressionInfoVisitor.h | 2 +- .../FunctionMaskingArgumentCheckVisitor.h | 2 +- .../GatherFunctionQuantileVisitor.h | 2 +- src/Interpreters/GetAggregatesVisitor.h | 2 +- src/Interpreters/GlobalSubqueriesVisitor.h | 2 +- src/Interpreters/GroupByFunctionKeysVisitor.h | 2 +- .../GroupingSetsRewriterVisitor.h | 2 +- src/Interpreters/IdentifierSemantic.h | 2 +- .../InJoinSubqueriesPreprocessor.cpp | 2 +- src/Interpreters/InterpreterExplainQuery.cpp | 2 +- .../InterpreterSelectWithUnionQuery.cpp | 2 +- .../JoinToSubqueryTransformVisitor.h | 2 +- src/Interpreters/JoinedTables.cpp | 2 +- .../MarkTableIdentifiersVisitor.h | 2 +- src/Interpreters/MonotonicityCheckVisitor.h | 2 +- src/Interpreters/MutationsInterpreter.cpp | 2 +- .../NormalizeSelectWithUnionQueryVisitor.h | 2 +- src/Interpreters/OptimizeIfChains.h | 2 +- .../OptimizeShardingKeyRewriteInVisitor.h | 2 +- src/Interpreters/PredicateRewriteVisitor.h | 2 +- src/Interpreters/QueryAliasesVisitor.h | 2 +- .../RedundantFunctionsInOrderByVisitor.h | 2 +- .../RemoveInjectiveFunctionsVisitor.h | 2 +- src/Interpreters/RenameColumnVisitor.h | 2 +- .../RequiredSourceColumnsVisitor.h | 2 +- src/Interpreters/RewriteAnyFunctionVisitor.h | 2 +- .../RewriteCountDistinctVisitor.h | 2 +- .../RewriteFunctionToSubcolumnVisitor.h | 2 +- src/Interpreters/RewriteOrderByVisitor.hpp | 2 +- .../RewriteSumIfFunctionVisitor.h | 2 +- .../SelectIntersectExceptQueryVisitor.h | 2 +- .../SubstituteColumnOptimizer.cpp | 2 +- src/Interpreters/TableOverrideUtils.cpp | 2 +- .../TranslateQualifiedNamesVisitor.h | 2 +- src/Interpreters/executeQuery.cpp | 2 +- .../InDepthNodeVisitor.h | 0 .../InsertQuerySettingsPushDownVisitor.h | 2 +- .../QueryWithOutputSettingsPushDownVisitor.h | 2 +- .../maskSensitiveInfoInQueryForLogging.cpp | 11 ++--- .../maskSensitiveInfoInQueryForLogging.h | 0 src/Storages/TTLDescription.cpp | 2 +- src/Storages/WindowView/StorageWindowView.cpp | 2 +- .../transformQueryForExternalDatabase.cpp | 2 +- src/TableFunctions/TableFunctionFactory.cpp | 3 ++ 65 files changed, 154 insertions(+), 64 deletions(-) create mode 100644 src/Common/KnownObjectNames.cpp create mode 100644 
src/Common/KnownObjectNames.h rename src/{Interpreters => Parsers}/InDepthNodeVisitor.h (100%) rename src/{Interpreters => Parsers}/maskSensitiveInfoInQueryForLogging.cpp (98%) rename src/{Interpreters => Parsers}/maskSensitiveInfoInQueryForLogging.h (100%) diff --git a/src/Backups/BackupInfo.cpp b/src/Backups/BackupInfo.cpp index a10bfb4e82a..009c262ad67 100644 --- a/src/Backups/BackupInfo.cpp +++ b/src/Backups/BackupInfo.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Backups/DDLAdjustingForBackupVisitor.h b/src/Backups/DDLAdjustingForBackupVisitor.h index 63353dcc000..8c8ac1c1ee5 100644 --- a/src/Backups/DDLAdjustingForBackupVisitor.h +++ b/src/Backups/DDLAdjustingForBackupVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/src/Common/KnownObjectNames.cpp b/src/Common/KnownObjectNames.cpp new file mode 100644 index 00000000000..45066fc93d4 --- /dev/null +++ b/src/Common/KnownObjectNames.cpp @@ -0,0 +1,48 @@ +#include +#include + + +namespace DB +{ + +bool KnownObjectNames::exists(const String & name) const +{ + std::lock_guard lock{mutex}; + if (names.contains(name)) + return true; + + if (case_insensitive_names.empty()) + { + String lower_name = Poco::toLower(name); + if (case_insensitive_names.contains(lower_name)) + return true; + } + + return false; +} + + +void KnownObjectNames::add(const String & name, bool case_insensitive) +{ + std::lock_guard lock{mutex}; + if (case_insensitive) + case_insensitive_names.emplace(Poco::toLower(name)); + else + names.emplace(name); +} + + +KnownTableFunctionNames & KnownTableFunctionNames::instance() +{ + static KnownTableFunctionNames the_instance; + return the_instance; +} + + +KnownFormatNames & KnownFormatNames::instance() +{ + static KnownFormatNames the_instance; + return the_instance; +} + +} diff --git a/src/Common/KnownObjectNames.h b/src/Common/KnownObjectNames.h new file mode 100644 index 00000000000..f003af69904 --- /dev/null +++ b/src/Common/KnownObjectNames.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class KnownObjectNames +{ +public: + bool exists(const String & name) const; + void add(const String & name, bool case_insensitive = false); + +private: + mutable std::mutex mutex; + std::unordered_set names; + std::unordered_set case_insensitive_names; +}; + + +class KnownTableFunctionNames : public KnownObjectNames +{ +public: + static KnownTableFunctionNames & instance(); +}; + + +class KnownFormatNames : public KnownObjectNames +{ +public: + static KnownFormatNames & instance(); +}; + +} diff --git a/src/Databases/DDLDependencyVisitor.h b/src/Databases/DDLDependencyVisitor.h index d23a7a697a9..7f285af42cb 100644 --- a/src/Databases/DDLDependencyVisitor.h +++ b/src/Databases/DDLDependencyVisitor.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include namespace DB { diff --git a/src/Databases/DDLRenamingVisitor.cpp b/src/Databases/DDLRenamingVisitor.cpp index 7ea5dbeda83..443ae37d66c 100644 --- a/src/Databases/DDLRenamingVisitor.cpp +++ b/src/Databases/DDLRenamingVisitor.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Databases/DDLRenamingVisitor.h b/src/Databases/DDLRenamingVisitor.h index 44146a8ee6b..0dad002483f 100644 --- a/src/Databases/DDLRenamingVisitor.h +++ b/src/Databases/DDLRenamingVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Formats/FormatFactory.cpp 
b/src/Formats/FormatFactory.cpp index a882fcf5009..bf1168dd8a7 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -444,6 +445,7 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_ if (target) throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); target = std::move(input_creator); + KnownFormatNames::instance().add(name); registerFileExtension(name, name); } @@ -453,6 +455,7 @@ void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name if (target) throw Exception("FormatFactory: Non trivial prefix and suffix checker " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); target = std::move(non_trivial_prefix_and_suffix_checker); + KnownFormatNames::instance().add(name); } void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker) diff --git a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h index 686594c088f..5fc98a42634 100644 --- a/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h +++ b/src/Functions/UserDefined/UserDefinedSQLFunctionVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/ActionsVisitor.h b/src/Interpreters/ActionsVisitor.h index fea013fd075..20782baca6b 100644 --- a/src/Interpreters/ActionsVisitor.h +++ b/src/Interpreters/ActionsVisitor.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/AggregateFunctionOfGroupByKeysVisitor.h b/src/Interpreters/AggregateFunctionOfGroupByKeysVisitor.h index ab01cc6a0b6..447cbbd054a 100644 --- a/src/Interpreters/AggregateFunctionOfGroupByKeysVisitor.h +++ b/src/Interpreters/AggregateFunctionOfGroupByKeysVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.h b/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.h index 81d936aeba5..02a5259ad48 100644 --- a/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.h +++ b/src/Interpreters/ArithmeticOperationsInAgrFuncOptimize.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/ArrayJoinedColumnsVisitor.h b/src/Interpreters/ArrayJoinedColumnsVisitor.h index b64a170cb00..c842b729518 100644 --- a/src/Interpreters/ArrayJoinedColumnsVisitor.h +++ b/src/Interpreters/ArrayJoinedColumnsVisitor.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include diff --git a/src/Interpreters/CollectJoinOnKeysVisitor.h b/src/Interpreters/CollectJoinOnKeysVisitor.h index e52b0c69591..8ebbcbb4685 100644 --- a/src/Interpreters/CollectJoinOnKeysVisitor.h +++ b/src/Interpreters/CollectJoinOnKeysVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Interpreters/ColumnAliasesVisitor.h b/src/Interpreters/ColumnAliasesVisitor.h index b593842e133..b7f271a0381 100644 --- a/src/Interpreters/ColumnAliasesVisitor.h +++ b/src/Interpreters/ColumnAliasesVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include namespace DB diff --git a/src/Interpreters/ConvertFunctionOrLikeVisitor.h b/src/Interpreters/ConvertFunctionOrLikeVisitor.h index f3da56eb60e..c0000c371ec 100644 --- 
a/src/Interpreters/ConvertFunctionOrLikeVisitor.h +++ b/src/Interpreters/ConvertFunctionOrLikeVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/ConvertStringsToEnumVisitor.h b/src/Interpreters/ConvertStringsToEnumVisitor.h index b1389f40654..d7a77eb651c 100644 --- a/src/Interpreters/ConvertStringsToEnumVisitor.h +++ b/src/Interpreters/ConvertStringsToEnumVisitor.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include diff --git a/src/Interpreters/CrossToInnerJoinVisitor.h b/src/Interpreters/CrossToInnerJoinVisitor.h index 704be42d3c1..21647feb9f5 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.h +++ b/src/Interpreters/CrossToInnerJoinVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9bcc086d6c7..331cc5092c5 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Interpreters/DuplicateOrderByVisitor.h b/src/Interpreters/DuplicateOrderByVisitor.h index de8cb4c8f32..0e1bca60545 100644 --- a/src/Interpreters/DuplicateOrderByVisitor.h +++ b/src/Interpreters/DuplicateOrderByVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/src/Interpreters/ExecuteScalarSubqueriesVisitor.h b/src/Interpreters/ExecuteScalarSubqueriesVisitor.h index d702404dab6..4c8f3bd58e5 100644 --- a/src/Interpreters/ExecuteScalarSubqueriesVisitor.h +++ b/src/Interpreters/ExecuteScalarSubqueriesVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace DB diff --git a/src/Interpreters/ExtractExpressionInfoVisitor.h b/src/Interpreters/ExtractExpressionInfoVisitor.h index a8cc3a39f89..873ef0ebb4a 100644 --- a/src/Interpreters/ExtractExpressionInfoVisitor.h +++ b/src/Interpreters/ExtractExpressionInfoVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace DB diff --git a/src/Interpreters/FunctionMaskingArgumentCheckVisitor.h b/src/Interpreters/FunctionMaskingArgumentCheckVisitor.h index 1f3c44f8087..548cdee67d9 100644 --- a/src/Interpreters/FunctionMaskingArgumentCheckVisitor.h +++ b/src/Interpreters/FunctionMaskingArgumentCheckVisitor.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include diff --git a/src/Interpreters/GatherFunctionQuantileVisitor.h b/src/Interpreters/GatherFunctionQuantileVisitor.h index 74388e7161a..374c6731e52 100644 --- a/src/Interpreters/GatherFunctionQuantileVisitor.h +++ b/src/Interpreters/GatherFunctionQuantileVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/GetAggregatesVisitor.h b/src/Interpreters/GetAggregatesVisitor.h index 036d50ba4d6..e7fb010cd27 100644 --- a/src/Interpreters/GetAggregatesVisitor.h +++ b/src/Interpreters/GetAggregatesVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index dc9294be878..5698e07fcf9 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/GroupByFunctionKeysVisitor.h b/src/Interpreters/GroupByFunctionKeysVisitor.h index e21e71d7a72..0a8e3b4e7f4 100644 --- a/src/Interpreters/GroupByFunctionKeysVisitor.h +++ 
b/src/Interpreters/GroupByFunctionKeysVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/GroupingSetsRewriterVisitor.h b/src/Interpreters/GroupingSetsRewriterVisitor.h index 3d56bf1917d..4388a3b9b0b 100644 --- a/src/Interpreters/GroupingSetsRewriterVisitor.h +++ b/src/Interpreters/GroupingSetsRewriterVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/IdentifierSemantic.h b/src/Interpreters/IdentifierSemantic.h index 178bd291beb..65b7d87b428 100644 --- a/src/Interpreters/IdentifierSemantic.h +++ b/src/Interpreters/IdentifierSemantic.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp index 58faeb41a15..b0bd645cf50 100644 --- a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp +++ b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 2864b433e00..22386a6e486 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index c695dae6d53..7f12bae06f4 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.h b/src/Interpreters/JoinToSubqueryTransformVisitor.h index 96420512ae6..596e7274c3c 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.h +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index b88bb5d1caf..95d7be4acf2 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/MarkTableIdentifiersVisitor.h b/src/Interpreters/MarkTableIdentifiersVisitor.h index d05c067397b..ba52fd656c7 100644 --- a/src/Interpreters/MarkTableIdentifiersVisitor.h +++ b/src/Interpreters/MarkTableIdentifiersVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/MonotonicityCheckVisitor.h b/src/Interpreters/MonotonicityCheckVisitor.h index c95f5209760..cd10ba1e6c7 100644 --- a/src/Interpreters/MonotonicityCheckVisitor.h +++ b/src/Interpreters/MonotonicityCheckVisitor.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 26b8bce1f4a..720636d0434 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h index e8194f0dfe1..d9896f29454 100644 --- 
a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h +++ b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include diff --git a/src/Interpreters/OptimizeIfChains.h b/src/Interpreters/OptimizeIfChains.h index 79a4f497ff8..6387c342f16 100644 --- a/src/Interpreters/OptimizeIfChains.h +++ b/src/Interpreters/OptimizeIfChains.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h index d546db40df7..cc1e285fe86 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h +++ b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/PredicateRewriteVisitor.h b/src/Interpreters/PredicateRewriteVisitor.h index d2b9ece9306..cbaba04f7af 100644 --- a/src/Interpreters/PredicateRewriteVisitor.h +++ b/src/Interpreters/PredicateRewriteVisitor.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace DB diff --git a/src/Interpreters/QueryAliasesVisitor.h b/src/Interpreters/QueryAliasesVisitor.h index 6e79cfc77be..d1cf68478db 100644 --- a/src/Interpreters/QueryAliasesVisitor.h +++ b/src/Interpreters/QueryAliasesVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/RedundantFunctionsInOrderByVisitor.h b/src/Interpreters/RedundantFunctionsInOrderByVisitor.h index 60c9fcf2a24..21042d1e25c 100644 --- a/src/Interpreters/RedundantFunctionsInOrderByVisitor.h +++ b/src/Interpreters/RedundantFunctionsInOrderByVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/src/Interpreters/RemoveInjectiveFunctionsVisitor.h b/src/Interpreters/RemoveInjectiveFunctionsVisitor.h index a3bbd562407..8cdade85b6d 100644 --- a/src/Interpreters/RemoveInjectiveFunctionsVisitor.h +++ b/src/Interpreters/RemoveInjectiveFunctionsVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include namespace DB diff --git a/src/Interpreters/RenameColumnVisitor.h b/src/Interpreters/RenameColumnVisitor.h index bf0cc21c630..aae84d04ecf 100644 --- a/src/Interpreters/RenameColumnVisitor.h +++ b/src/Interpreters/RenameColumnVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB { diff --git a/src/Interpreters/RequiredSourceColumnsVisitor.h b/src/Interpreters/RequiredSourceColumnsVisitor.h index f197d126223..0adc80728e8 100644 --- a/src/Interpreters/RequiredSourceColumnsVisitor.h +++ b/src/Interpreters/RequiredSourceColumnsVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/RewriteAnyFunctionVisitor.h b/src/Interpreters/RewriteAnyFunctionVisitor.h index d29af322711..bbca6d51f1c 100644 --- a/src/Interpreters/RewriteAnyFunctionVisitor.h +++ b/src/Interpreters/RewriteAnyFunctionVisitor.h @@ -3,7 +3,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/RewriteCountDistinctVisitor.h b/src/Interpreters/RewriteCountDistinctVisitor.h index b15c03f9c24..dd1d44e2ae7 100644 --- a/src/Interpreters/RewriteCountDistinctVisitor.h +++ b/src/Interpreters/RewriteCountDistinctVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include "Interpreters/TreeRewriter.h" namespace DB diff --git a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.h b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.h index 
4d064bdee10..71281cc9501 100644 --- a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.h +++ b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB diff --git a/src/Interpreters/RewriteOrderByVisitor.hpp b/src/Interpreters/RewriteOrderByVisitor.hpp index a3857e046ca..b6ea2828066 100644 --- a/src/Interpreters/RewriteOrderByVisitor.hpp +++ b/src/Interpreters/RewriteOrderByVisitor.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/RewriteSumIfFunctionVisitor.h b/src/Interpreters/RewriteSumIfFunctionVisitor.h index 86aeef65377..a0d7236bf4c 100644 --- a/src/Interpreters/RewriteSumIfFunctionVisitor.h +++ b/src/Interpreters/RewriteSumIfFunctionVisitor.h @@ -3,7 +3,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/SelectIntersectExceptQueryVisitor.h b/src/Interpreters/SelectIntersectExceptQueryVisitor.h index daf6d2ea0df..764d9d233d4 100644 --- a/src/Interpreters/SelectIntersectExceptQueryVisitor.h +++ b/src/Interpreters/SelectIntersectExceptQueryVisitor.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/src/Interpreters/SubstituteColumnOptimizer.cpp b/src/Interpreters/SubstituteColumnOptimizer.cpp index da738d3db1e..64b2878a0ab 100644 --- a/src/Interpreters/SubstituteColumnOptimizer.cpp +++ b/src/Interpreters/SubstituteColumnOptimizer.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include diff --git a/src/Interpreters/TableOverrideUtils.cpp b/src/Interpreters/TableOverrideUtils.cpp index 58e885380bf..9703e3e9e77 100644 --- a/src/Interpreters/TableOverrideUtils.cpp +++ b/src/Interpreters/TableOverrideUtils.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.h b/src/Interpreters/TranslateQualifiedNamesVisitor.h index 73e45fc7ea0..86e8e5df8d2 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.h +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.h @@ -4,7 +4,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index a6f3a38303c..e1966597e7d 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/InDepthNodeVisitor.h b/src/Parsers/InDepthNodeVisitor.h similarity index 100% rename from src/Interpreters/InDepthNodeVisitor.h rename to src/Parsers/InDepthNodeVisitor.h diff --git a/src/Parsers/InsertQuerySettingsPushDownVisitor.h b/src/Parsers/InsertQuerySettingsPushDownVisitor.h index d1f161fc89b..0bfc5705077 100644 --- a/src/Parsers/InsertQuerySettingsPushDownVisitor.h +++ b/src/Parsers/InsertQuerySettingsPushDownVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h b/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h index fde8a07b555..53b8f64b96a 100644 --- a/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h +++ b/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp b/src/Parsers/maskSensitiveInfoInQueryForLogging.cpp similarity index 98% rename from 
src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp rename to src/Parsers/maskSensitiveInfoInQueryForLogging.cpp index 6b2c865d166..ebadfa5425b 100644 --- a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp +++ b/src/Parsers/maskSensitiveInfoInQueryForLogging.cpp @@ -1,7 +1,6 @@ -#include +#include -#include -#include +#include #include #include #include @@ -9,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -278,7 +277,7 @@ namespace return; } - if (FormatFactory::instance().getAllFormats().contains(format)) + if (KnownFormatNames::instance().exists(format)) return; /// The argument after 'url' is a format: s3('url', 'format', ...) /// The argument after 'url' is not a format so we do our replacement: @@ -305,7 +304,7 @@ namespace /// Skip 1 or 2 arguments with table_function() or db.table or 'db', 'table'. const auto * table_function = arguments[arg_num]->as(); - if (table_function && TableFunctionFactory::instance().isTableFunctionName(table_function->name)) + if (table_function && KnownTableFunctionNames::instance().exists(table_function->name)) { ++arg_num; } diff --git a/src/Interpreters/maskSensitiveInfoInQueryForLogging.h b/src/Parsers/maskSensitiveInfoInQueryForLogging.h similarity index 100% rename from src/Interpreters/maskSensitiveInfoInQueryForLogging.h rename to src/Parsers/maskSensitiveInfoInQueryForLogging.h diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index 41c9c1996b1..46df784986f 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index c0bc5ad8da9..c4dd9a8442b 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/transformQueryForExternalDatabase.cpp b/src/Storages/transformQueryForExternalDatabase.cpp index 51b11680f82..1b3a00c417d 100644 --- a/src/Storages/transformQueryForExternalDatabase.cpp +++ b/src/Storages/transformQueryForExternalDatabase.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/TableFunctions/TableFunctionFactory.cpp b/src/TableFunctions/TableFunctionFactory.cpp index 3d2a72ddc9e..ff001661000 100644 --- a/src/TableFunctions/TableFunctionFactory.cpp +++ b/src/TableFunctions/TableFunctionFactory.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,8 @@ void TableFunctionFactory::registerFunction( && !case_insensitive_table_functions.emplace(Poco::toLower(name), value).second) throw Exception("TableFunctionFactory: the case insensitive table function name '" + name + "' is not unique", ErrorCodes::LOGICAL_ERROR); + + KnownTableFunctionNames::instance().add(name, (case_sensitiveness == CaseInsensitive)); } TableFunctionPtr TableFunctionFactory::get( From 28fec44ffcf7d27dfea25206538aae02cde6558f Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 14 Nov 2022 20:45:28 +0000 Subject: [PATCH 26/78] Fix merging null values in AggregateFunctionSumMap --- .../AggregateFunctionSumMap.h | 17 ++- src/Common/FieldVisitorSum.cpp | 7 +- .../02480_max_map_null_totals.reference | 119 ++++++++++++++++++ .../0_stateless/02480_max_map_null_totals.sql | 39 ++++++ 4 files changed, 178 insertions(+), 4 
deletions(-) create mode 100644 tests/queries/0_stateless/02480_max_map_null_totals.reference create mode 100644 tests/queries/0_stateless/02480_max_map_null_totals.sql diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 6a8fc9e99d8..cee59fcc907 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -202,7 +202,7 @@ public: auto & merged_maps = this->data(place).merged_maps; for (size_t col = 0, size = values_types.size(); col < size; ++col) { - const auto & array_column = assert_cast(*columns[col + 1]); + const auto & array_column = assert_cast(*columns[col + 1]); const IColumn & value_column = array_column.getData(); const IColumn::Offsets & offsets = array_column.getOffsets(); const size_t values_vec_offset = offsets[row_num - 1]; @@ -532,7 +532,12 @@ private: public: explicit FieldVisitorMax(const Field & rhs_) : rhs(rhs_) {} - bool operator() (Null &) const { throw Exception("Cannot compare Nulls", ErrorCodes::LOGICAL_ERROR); } + bool operator() (Null &) const + { + /// Do not update current value, skip nulls + return false; + } + bool operator() (AggregateFunctionStateData &) const { throw Exception("Cannot compare AggregateFunctionStates", ErrorCodes::LOGICAL_ERROR); } bool operator() (Array & x) const { return compareImpl(x); } @@ -567,7 +572,13 @@ private: public: explicit FieldVisitorMin(const Field & rhs_) : rhs(rhs_) {} - bool operator() (Null &) const { throw Exception("Cannot compare Nulls", ErrorCodes::LOGICAL_ERROR); } + + bool operator() (Null &) const + { + /// Do not update current value, skip nulls + return false; + } + bool operator() (AggregateFunctionStateData &) const { throw Exception("Cannot sum AggregateFunctionStates", ErrorCodes::LOGICAL_ERROR); } bool operator() (Array & x) const { return compareImpl(x); } diff --git a/src/Common/FieldVisitorSum.cpp b/src/Common/FieldVisitorSum.cpp index 2c404c33177..332b1a4255b 100644 --- a/src/Common/FieldVisitorSum.cpp +++ b/src/Common/FieldVisitorSum.cpp @@ -21,7 +21,12 @@ bool FieldVisitorSum::operator() (UInt64 & x) const bool FieldVisitorSum::operator() (Float64 & x) const { x += rhs.get(); return x != 0; } -bool FieldVisitorSum::operator() (Null &) const { throw Exception("Cannot sum Nulls", ErrorCodes::LOGICAL_ERROR); } +bool FieldVisitorSum::operator() (Null &) const +{ + /// Do not add anything + return rhs != 0; +} + bool FieldVisitorSum::operator() (String &) const { throw Exception("Cannot sum Strings", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (Array &) const { throw Exception("Cannot sum Arrays", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (Tuple &) const { throw Exception("Cannot sum Tuples", ErrorCodes::LOGICAL_ERROR); } diff --git a/tests/queries/0_stateless/02480_max_map_null_totals.reference b/tests/queries/0_stateless/02480_max_map_null_totals.reference new file mode 100644 index 00000000000..5cc9b5a495f --- /dev/null +++ b/tests/queries/0_stateless/02480_max_map_null_totals.reference @@ -0,0 +1,119 @@ +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([0],[0]) 
+([2],[2]) +([1],[1]) + +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) +- +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([0],[0]) +([2],[2]) +([1],[1]) + +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) +- +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) + +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([-1,0],[0,0]) +([1,2],[0,2]) +([0,1],[0,1]) +([-1,0,1,2],[0,0,0,2]) +([0],[0]) +([2],[2]) +([1],[1]) + +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) +([0],[0]) +([2],[2]) +([1],[1]) +([0,2],[0,2]) diff --git a/tests/queries/0_stateless/02480_max_map_null_totals.sql b/tests/queries/0_stateless/02480_max_map_null_totals.sql new file mode 100644 index 00000000000..81e2a5c4243 --- /dev/null +++ b/tests/queries/0_stateless/02480_max_map_null_totals.sql @@ -0,0 +1,39 @@ +SELECT maxMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT maxMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT maxMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT minMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT minMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT minMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT sumMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT sumMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT sumMap([number % 3, number % 4 - 1], [number, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT '-'; + +SELECT maxMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT maxMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT maxMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT minMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT minMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT minMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT sumMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT sumMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT sumMap([number % 3, number % 4 - 1], [number :: Float64, NULL]) FROM numbers(3) GROUP BY 
number WITH CUBE; + +SELECT '-'; + +SELECT maxMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT maxMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT maxMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT minMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT minMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT minMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; + +SELECT sumMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH TOTALS; +SELECT sumMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH ROLLUP; +SELECT sumMap([number % 3, number % 4 - 1], [number :: UInt256, NULL]) FROM numbers(3) GROUP BY number WITH CUBE; From 5eba20e1198b0bcd3fa91caa2a597e53cd5432fd Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 15 Nov 2022 09:46:24 +0800 Subject: [PATCH 27/78] modify return type from Int64 to UInt64 --- docs/en/sql-reference/functions/math-functions.md | 2 +- src/Functions/factorial.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 7f349686d8b..47c27268b09 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -553,7 +553,7 @@ Result: ## factorial(n) -Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is Int64. +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater overflows the range for Int64 and will cause exception throw. diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp index 5c46b97c193..b76ef90a48d 100644 --- a/src/Functions/factorial.cpp +++ b/src/Functions/factorial.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes template struct FactorialImpl { - using ResultType = Int64; + using ResultType = UInt64; static const constexpr bool allow_decimal = false; static const constexpr bool allow_fixed_string = false; static const constexpr bool allow_string_integer = false; @@ -101,7 +101,7 @@ REGISTER_FUNCTION(Factorial) factory.registerFunction( { R"( -Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is Int64. +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater overflows the range for Int64 and will cause exception throw. 
)", From 7fdb2f400e97db601c2a290ab73da67f1c9695ad Mon Sep 17 00:00:00 2001 From: freedomDR <1640145602@qq.com> Date: Mon, 14 Nov 2022 05:31:27 +0000 Subject: [PATCH 28/78] unhex support FixedString --- .../FunctionsBinaryRepresentation.cpp | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/Functions/FunctionsBinaryRepresentation.cpp b/src/Functions/FunctionsBinaryRepresentation.cpp index 775696ded8a..9e475da87c2 100644 --- a/src/Functions/FunctionsBinaryRepresentation.cpp +++ b/src/Functions/FunctionsBinaryRepresentation.cpp @@ -5,6 +5,8 @@ #include #include #include +#include "Columns/IColumn.h" +#include "DataTypes/IDataType.h" #include #include #include @@ -566,7 +568,8 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!isString(arguments[0])) + WhichDataType which(arguments[0]); + if (!which.isStringOrFixedString()) throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -612,6 +615,39 @@ public: return col_res; } + else if (const ColumnFixedString * col_fix_string = checkAndGetColumn(column.get())) + { + auto col_res = ColumnString::create(); + + ColumnString::Chars & out_vec = col_res->getChars(); + ColumnString::Offsets & out_offsets = col_res->getOffsets(); + + const ColumnString::Chars & in_vec = col_fix_string->getChars(); + size_t n = col_fix_string->getN(); + + size_t size = col_fix_string->size(); + out_offsets.resize(size); + out_vec.resize(in_vec.size() / word_size + size); + + char * begin = reinterpret_cast(out_vec.data()); + char * pos = begin; + size_t prev_offset = 0; + + for (size_t i = 0; i < size; ++i) + { + size_t new_offset = prev_offset + n; + + Impl::decode(reinterpret_cast(&in_vec[prev_offset]), reinterpret_cast(&in_vec[new_offset]), pos); + + out_offsets[i] = pos - begin; + + prev_offset = new_offset; + } + + out_vec.resize(pos - begin); + + return col_res; + } else { throw Exception("Illegal column " + arguments[0].column->getName() From 0fb23b05b9f129d6437117c09720ed78eb108f58 Mon Sep 17 00:00:00 2001 From: freedomDR <1640145602@qq.com> Date: Mon, 14 Nov 2022 05:32:02 +0000 Subject: [PATCH 29/78] add tests --- tests/queries/0_stateless/02474_unhex_in_fix_string.reference | 1 + tests/queries/0_stateless/02474_unhex_in_fix_string.sql | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 tests/queries/0_stateless/02474_unhex_in_fix_string.reference create mode 100644 tests/queries/0_stateless/02474_unhex_in_fix_string.sql diff --git a/tests/queries/0_stateless/02474_unhex_in_fix_string.reference b/tests/queries/0_stateless/02474_unhex_in_fix_string.reference new file mode 100644 index 00000000000..407a8b39948 --- /dev/null +++ b/tests/queries/0_stateless/02474_unhex_in_fix_string.reference @@ -0,0 +1 @@ +ClickHouse ClickHouse diff --git a/tests/queries/0_stateless/02474_unhex_in_fix_string.sql b/tests/queries/0_stateless/02474_unhex_in_fix_string.sql new file mode 100644 index 00000000000..288336aa4fa --- /dev/null +++ b/tests/queries/0_stateless/02474_unhex_in_fix_string.sql @@ -0,0 +1,4 @@ +drop table if exists unhex_in_fix_string_table; +create table unhex_in_fix_string_table ( dt Date, s1 FixedString(20), s2 String) engine=MergeTree partition by dt order by tuple(); +insert into unhex_in_fix_string_table values(today(), '436C69636B486F757365', '436C69636B486F757365'); +select unhex(s1), unhex(s2) from unhex_in_fix_string_table; From 5753be7b9925404ceb3b312739bb6aa44a8ee0a3 Mon Sep 17 
00:00:00 2001 From: freedomDR <1640145602@qq.com> Date: Mon, 14 Nov 2022 05:43:56 +0000 Subject: [PATCH 30/78] remove useless head file --- src/Functions/FunctionsBinaryRepresentation.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Functions/FunctionsBinaryRepresentation.cpp b/src/Functions/FunctionsBinaryRepresentation.cpp index 9e475da87c2..187d82981db 100644 --- a/src/Functions/FunctionsBinaryRepresentation.cpp +++ b/src/Functions/FunctionsBinaryRepresentation.cpp @@ -5,8 +5,6 @@ #include #include #include -#include "Columns/IColumn.h" -#include "DataTypes/IDataType.h" #include #include #include From 37c448c6be84cb12bc9a18b64efdbd1d8def77b1 Mon Sep 17 00:00:00 2001 From: freedomDR <1640145602@qq.com> Date: Mon, 14 Nov 2022 05:54:54 +0000 Subject: [PATCH 31/78] complete unhex doc --- docs/en/sql-reference/functions/encoding-functions.md | 2 +- docs/zh/sql-reference/functions/encoding-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index 4a6e46e1759..cccc02c2553 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -185,7 +185,7 @@ unhex(arg) **Arguments** -- `arg` — A string containing any number of hexadecimal digits. Type: [String](../../sql-reference/data-types/string.md). +- `arg` — A string containing any number of hexadecimal digits. Type: [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md). Supports both uppercase and lowercase letters `A-F`. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the `00-0F` byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). For a numeric argument the inverse of hex(N) is not performed by unhex(). 
diff --git a/docs/zh/sql-reference/functions/encoding-functions.md b/docs/zh/sql-reference/functions/encoding-functions.md index 5e9260ff94a..846d6c58f40 100644 --- a/docs/zh/sql-reference/functions/encoding-functions.md +++ b/docs/zh/sql-reference/functions/encoding-functions.md @@ -181,7 +181,7 @@ unhex(arg) **参数** -- `arg` — 包含任意数量的十六进制数字的字符串。类型为:[String](../../sql-reference/data-types/string.md)。 +- `arg` — 包含任意数量的十六进制数字的字符串。类型为:[String](../../sql-reference/data-types/string.md),[FixedString](../../sql-reference/data-types/fixedstring.md)。 支持大写和小写字母A-F。十六进制数字的数量不必是偶数。如果是奇数,则最后一位数被解释为00-0F字节的低位。如果参数字符串包含除十六进制数字以外的任何内容,则返回一些实现定义的结果(不抛出异常)。对于数字参数, unhex()不执行 hex(N) 的倒数。 From 0017416069fa23e1e530d6a56197669113657489 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Tue, 15 Nov 2022 11:55:54 +0100 Subject: [PATCH 32/78] Update src/Common/FieldVisitorSum.cpp --- src/Common/FieldVisitorSum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FieldVisitorSum.cpp b/src/Common/FieldVisitorSum.cpp index 332b1a4255b..db7b4850204 100644 --- a/src/Common/FieldVisitorSum.cpp +++ b/src/Common/FieldVisitorSum.cpp @@ -24,7 +24,7 @@ bool FieldVisitorSum::operator() (Float64 & x) const { x += rhs.get(); bool FieldVisitorSum::operator() (Null &) const { /// Do not add anything - return rhs != 0; + return false; } bool FieldVisitorSum::operator() (String &) const { throw Exception("Cannot sum Strings", ErrorCodes::LOGICAL_ERROR); } From d7c882951f1e3c5c66d744e9105a6d30a419a6e0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 15 Nov 2022 12:36:28 +0000 Subject: [PATCH 33/78] Fix nullptr dereference in collectScopeValidIdentifiersForTypoCorrection --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 20 ++++++++++++++++++- .../02480_analyzer_alias_nullptr.reference | 0 .../02480_analyzer_alias_nullptr.sql | 3 +++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02480_analyzer_alias_nullptr.reference create mode 100644 tests/queries/0_stateless/02480_analyzer_alias_nullptr.sql diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 24b88a729be..760d036ab4f 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1517,6 +1517,7 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection( { for (const auto & [name, expression] : scope.alias_name_to_expression_node) { + assert(expression); auto expression_identifier = Identifier(name); valid_identifiers_result.insert(expression_identifier); @@ -2170,7 +2171,24 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier auto & alias_identifier_node = it->second->as(); auto identifier = alias_identifier_node.getIdentifier(); auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, identifier_resolve_settings); - it->second = lookup_result.resolved_identifier; + if (lookup_result.isResolved()) + { + it->second = lookup_result.resolved_identifier; + } + else + { + alias_name_to_node_map.erase(it); + + std::unordered_set valid_identifiers; + collectScopeWithParentScopesValidIdentifiersForTypoCorrection(identifier, scope, true, false, false, valid_identifiers); + + auto hints = collectIdentifierTypoHints(identifier, valid_identifiers); + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Unknown {} identifier '{}' in scope {}{}", + toStringLowercase(IdentifierLookupContext::EXPRESSION), + identifier.getFullName(), + 
scope.scope_node->formatASTForErrorMessage(), + getHintsErrorMessageSuffix(hints)); + } /** During collection of aliases if node is identifier and has alias, we cannot say if it is * column or function node. Check QueryExpressionsAliasVisitor documentation for clarification. diff --git a/tests/queries/0_stateless/02480_analyzer_alias_nullptr.reference b/tests/queries/0_stateless/02480_analyzer_alias_nullptr.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02480_analyzer_alias_nullptr.sql b/tests/queries/0_stateless/02480_analyzer_alias_nullptr.sql new file mode 100644 index 00000000000..f6b381e5c70 --- /dev/null +++ b/tests/queries/0_stateless/02480_analyzer_alias_nullptr.sql @@ -0,0 +1,3 @@ +SET allow_experimental_analyzer = 1; + +SELECT min(b), x AS b FROM (SELECT max(number) FROM numbers(1)); -- { serverError UNKNOWN_IDENTIFIER } From 143b67d0beecfd0f51c6d4499a0383e003baea96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 15 Nov 2022 15:40:06 +0100 Subject: [PATCH 34/78] Fix ubsan in AggregateFunctionMinMaxAny::read with high sizes --- src/AggregateFunctions/AggregateFunctionMinMaxAny.h | 7 ++++++- src/Common/Arena.h | 2 +- .../0_stateless/02481_i43247_ubsan_in_minmaxany.reference | 0 .../0_stateless/02481_i43247_ubsan_in_minmaxany.sql | 3 +++ 4 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference create mode 100644 tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index 46be7331195..18f065caaf9 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -29,6 +29,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NOT_IMPLEMENTED; + extern const int TOO_LARGE_STRING_SIZE; } /** Aggregate functions that store one of passed values. @@ -521,7 +522,11 @@ public: { if (capacity < rhs_size) { - capacity = static_cast(roundUpToPowerOfTwoOrZero(rhs_size)); + capacity = static_cast(roundUpToPowerOfTwoOrZero(rhs_size)); + /// It might happen if the size was too big and the rounded value does not fit a size_t + if (unlikely(capacity <= rhs_size)) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", rhs_size); + /// Don't free large_data here. large_data = arena->alloc(capacity); } diff --git a/src/Common/Arena.h b/src/Common/Arena.h index 17d53acd8f7..5772dff6bca 100644 --- a/src/Common/Arena.h +++ b/src/Common/Arena.h @@ -141,7 +141,7 @@ public: /// Get piece of memory, without alignment. 
char * alloc(size_t size) { - if (unlikely(head->pos + size > head->end)) + if (unlikely(static_cast(size) > head->end - head->pos)) addMemoryChunk(size); char * res = head->pos; diff --git a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql new file mode 100644 index 00000000000..7204053de04 --- /dev/null +++ b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql @@ -0,0 +1,3 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/43247 +SELECT finalizeAggregation(CAST('AggregateFunction(categoricalInformationValue, Nullable(UInt8), UInt8)AggregateFunction(categoricalInformationValue, Nullable(UInt8), UInt8)', + 'AggregateFunction(min, String)')); -- { serverError 131 } From d49b65cf1ead8fbd4a5d996c2d405f18ff954aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 15 Nov 2022 16:31:12 +0100 Subject: [PATCH 35/78] Fix capacity check --- src/AggregateFunctions/AggregateFunctionMinMaxAny.h | 2 +- .../0_stateless/02481_i43247_ubsan_in_minmaxany.reference | 1 + tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index 18f065caaf9..1f3c51c1c1c 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -524,7 +524,7 @@ public: { capacity = static_cast(roundUpToPowerOfTwoOrZero(rhs_size)); /// It might happen if the size was too big and the rounded value does not fit a size_t - if (unlikely(capacity <= rhs_size)) + if (unlikely(capacity < rhs_size)) throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "String size is too big ({})", rhs_size); /// Don't free large_data here. diff --git a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference index e69de29bb2d..3e3abfb9a41 100644 --- a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference +++ b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.reference @@ -0,0 +1 @@ +0123456789012345678901234567890123456789012345678901234567890123 diff --git a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql index 7204053de04..7dc29c2daae 100644 --- a/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql +++ b/tests/queries/0_stateless/02481_i43247_ubsan_in_minmaxany.sql @@ -1,3 +1,7 @@ -- https://github.com/ClickHouse/ClickHouse/issues/43247 SELECT finalizeAggregation(CAST('AggregateFunction(categoricalInformationValue, Nullable(UInt8), UInt8)AggregateFunction(categoricalInformationValue, Nullable(UInt8), UInt8)', 'AggregateFunction(min, String)')); -- { serverError 131 } + +-- Value from hex(minState('0123456789012345678901234567890123456789012345678901234567890123')). 
Size 63 + 1 (64) +SELECT finalizeAggregation(CAST(unhex('4000000030313233343536373839303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930313233'), + 'AggregateFunction(min, String)')); From 03968eb694e414281ac5b0529edd2785b266ef59 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 15 Nov 2022 16:16:04 +0000 Subject: [PATCH 36/78] Better message in wait_zookeeper_to_start --- tests/integration/helpers/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 666833013c8..a0e8e0b0cce 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2070,10 +2070,10 @@ class ClickHouseCluster: logging.debug("All instances of ZooKeeper started") return except Exception as ex: - logging.debug("Can't connect to ZooKeeper " + str(ex)) + logging.debug(f"Can't connect to ZooKeeper {instance}: {ex}") time.sleep(0.5) - raise Exception("Cannot wait ZooKeeper container") + raise Exception("Cannot wait ZooKeeper container (probably it's a `iptables-nft` issue, you may try to `sudo iptables -P FORWARD ACCEPT`)") def make_hdfs_api(self, timeout=180, kerberized=False): if kerberized: From aaeeeaf1fc1c58d1d96521d9c604a213cb8cd3c4 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 15 Nov 2022 16:40:01 +0000 Subject: [PATCH 37/78] Automatic style fix --- tests/integration/helpers/cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index a0e8e0b0cce..a190126a8ff 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2073,7 +2073,9 @@ class ClickHouseCluster: logging.debug(f"Can't connect to ZooKeeper {instance}: {ex}") time.sleep(0.5) - raise Exception("Cannot wait ZooKeeper container (probably it's a `iptables-nft` issue, you may try to `sudo iptables -P FORWARD ACCEPT`)") + raise Exception( + "Cannot wait ZooKeeper container (probably it's a `iptables-nft` issue, you may try to `sudo iptables -P FORWARD ACCEPT`)" + ) def make_hdfs_api(self, timeout=180, kerberized=False): if kerberized: From 4778b5c13f7aeb2602047a790fcf640251b4c2b9 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 15 Nov 2022 20:23:49 +0000 Subject: [PATCH 38/78] Fix IS (NOT)NULL operator --- src/Parsers/ExpressionListParsers.cpp | 60 +++++++++---------- .../02477_is_null_parser.reference | 3 + .../0_stateless/02477_is_null_parser.sql | 3 + 3 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 tests/queries/0_stateless/02477_is_null_parser.reference create mode 100644 tests/queries/0_stateless/02477_is_null_parser.sql diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index d29aa248ec4..29158254e88 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -2199,40 +2199,40 @@ std::vector> ParserExpressionImpl::operators_t {"AND", Operator("and", 4, 2, OperatorType::Mergeable)}, {"BETWEEN", Operator("", 6, 0, OperatorType::StartBetween)}, {"NOT BETWEEN", Operator("", 6, 0, OperatorType::StartNotBetween)}, - {"IS NULL", Operator("isNull", 8, 1, OperatorType::IsNull)}, - {"IS NOT NULL", Operator("isNotNull", 8, 1, OperatorType::IsNull)}, - {"==", Operator("equals", 9, 2, OperatorType::Comparison)}, - {"!=", Operator("notEquals", 9, 2, OperatorType::Comparison)}, - {"<>", Operator("notEquals", 9, 2, 
OperatorType::Comparison)}, - {"<=", Operator("lessOrEquals", 9, 2, OperatorType::Comparison)}, - {">=", Operator("greaterOrEquals", 9, 2, OperatorType::Comparison)}, - {"<", Operator("less", 9, 2, OperatorType::Comparison)}, - {">", Operator("greater", 9, 2, OperatorType::Comparison)}, - {"=", Operator("equals", 9, 2, OperatorType::Comparison)}, - {"LIKE", Operator("like", 9, 2)}, - {"ILIKE", Operator("ilike", 9, 2)}, - {"NOT LIKE", Operator("notLike", 9, 2)}, - {"NOT ILIKE", Operator("notILike", 9, 2)}, - {"IN", Operator("in", 9, 2)}, - {"NOT IN", Operator("notIn", 9, 2)}, - {"GLOBAL IN", Operator("globalIn", 9, 2)}, - {"GLOBAL NOT IN", Operator("globalNotIn", 9, 2)}, - {"||", Operator("concat", 10, 2, OperatorType::Mergeable)}, - {"+", Operator("plus", 11, 2)}, - {"-", Operator("minus", 11, 2)}, - {"*", Operator("multiply", 12, 2)}, - {"/", Operator("divide", 12, 2)}, - {"%", Operator("modulo", 12, 2)}, - {"MOD", Operator("modulo", 12, 2)}, - {"DIV", Operator("intDiv", 12, 2)}, - {".", Operator("tupleElement", 14, 2, OperatorType::TupleElement)}, - {"[", Operator("arrayElement", 14, 2, OperatorType::ArrayElement)}, - {"::", Operator("CAST", 14, 2, OperatorType::Cast)}, + {"==", Operator("equals", 8, 2, OperatorType::Comparison)}, + {"!=", Operator("notEquals", 8, 2, OperatorType::Comparison)}, + {"<>", Operator("notEquals", 8, 2, OperatorType::Comparison)}, + {"<=", Operator("lessOrEquals", 8, 2, OperatorType::Comparison)}, + {">=", Operator("greaterOrEquals", 8, 2, OperatorType::Comparison)}, + {"<", Operator("less", 8, 2, OperatorType::Comparison)}, + {">", Operator("greater", 8, 2, OperatorType::Comparison)}, + {"=", Operator("equals", 8, 2, OperatorType::Comparison)}, + {"LIKE", Operator("like", 8, 2)}, + {"ILIKE", Operator("ilike", 8, 2)}, + {"NOT LIKE", Operator("notLike", 8, 2)}, + {"NOT ILIKE", Operator("notILike", 8, 2)}, + {"IN", Operator("in", 8, 2)}, + {"NOT IN", Operator("notIn", 8, 2)}, + {"GLOBAL IN", Operator("globalIn", 8, 2)}, + {"GLOBAL NOT IN", Operator("globalNotIn", 8, 2)}, + {"||", Operator("concat", 9, 2, OperatorType::Mergeable)}, + {"+", Operator("plus", 10, 2)}, + {"-", Operator("minus", 10, 2)}, + {"*", Operator("multiply", 11, 2)}, + {"/", Operator("divide", 11, 2)}, + {"%", Operator("modulo", 11, 2)}, + {"MOD", Operator("modulo", 11, 2)}, + {"DIV", Operator("intDiv", 11, 2)}, + {".", Operator("tupleElement", 13, 2, OperatorType::TupleElement)}, + {"[", Operator("arrayElement", 13, 2, OperatorType::ArrayElement)}, + {"::", Operator("CAST", 13, 2, OperatorType::Cast)}, + {"IS NULL", Operator("isNull", 13, 1, OperatorType::IsNull)}, + {"IS NOT NULL", Operator("isNotNull", 13, 1, OperatorType::IsNull)}, }); std::vector> ParserExpressionImpl::unary_operators_table({ {"NOT", Operator("not", 5, 1)}, - {"-", Operator("negate", 13, 1)} + {"-", Operator("negate", 12, 1)} }); Operator ParserExpressionImpl::finish_between_operator = Operator("", 7, 0, OperatorType::FinishBetween); diff --git a/tests/queries/0_stateless/02477_is_null_parser.reference b/tests/queries/0_stateless/02477_is_null_parser.reference new file mode 100644 index 00000000000..57d96862011 --- /dev/null +++ b/tests/queries/0_stateless/02477_is_null_parser.reference @@ -0,0 +1,3 @@ +SELECT (\'a\' IS NULL) + (\'b\' IS NOT NULL) +SELECT (\'a\' IS NULL) = 0 +SELECT CAST(1 IS NULL, \'Int32\') diff --git a/tests/queries/0_stateless/02477_is_null_parser.sql b/tests/queries/0_stateless/02477_is_null_parser.sql new file mode 100644 index 00000000000..b95a35fde21 --- /dev/null +++ 
b/tests/queries/0_stateless/02477_is_null_parser.sql @@ -0,0 +1,3 @@ +EXPLAIN SYNTAX SELECT 'a' IS NULL + 'b' IS NOT NULL; +EXPLAIN SYNTAX SELECT 'a' IS NULL = 0; +EXPLAIN SYNTAX SELECT 1 IS NULL :: Int32; From fcdc9dfaacfb1d473c142cad7303d9c12e170a1b Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 15 Nov 2022 20:52:18 +0000 Subject: [PATCH 39/78] Make test_global_overcommit_tracker non-parallel --- tests/integration/parallel_skip.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/parallel_skip.json b/tests/integration/parallel_skip.json index 3b4d1f2f29a..ba593b76bbf 100644 --- a/tests/integration/parallel_skip.json +++ b/tests/integration/parallel_skip.json @@ -48,6 +48,8 @@ "test_system_replicated_fetches/test.py::test_system_replicated_fetches", "test_zookeeper_config_load_balancing/test.py::test_round_robin", + "test_global_overcommit_tracker/test.py::test_global_overcommit", + "test_user_ip_restrictions/test.py::test_ipv4", "test_user_ip_restrictions/test.py::test_ipv6" ] From 8d72f75556bbbb7dbe5675324bb3e00634480492 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 11 Nov 2022 16:26:04 +0100 Subject: [PATCH 40/78] Make the password wiping an option of IAST::format(). --- programs/copier/TaskTable.cpp | 2 +- src/Analyzer/FunctionNode.cpp | 7 +- src/Backups/BackupInfo.cpp | 5 +- src/Backups/BackupSettings.cpp | 7 +- src/Backups/BackupsWorker.cpp | 1 + src/Backups/RestoreSettings.cpp | 8 +- src/Common/SensitiveDataMasker.cpp | 25 + src/Common/SensitiveDataMasker.h | 4 + .../Access/InterpreterShowAccessQuery.cpp | 7 +- ...InterpreterShowCreateAccessEntityQuery.cpp | 14 +- src/Interpreters/DDLTask.cpp | 5 +- src/Interpreters/SystemLog.cpp | 2 +- src/Interpreters/executeQuery.cpp | 15 +- src/Parsers/ASTBackupQuery.cpp | 8 +- src/Parsers/ASTBackupQuery.h | 5 +- src/Parsers/ASTFunction.cpp | 36 +- src/Parsers/ASTFunction.h | 14 + .../ASTFunctionWithKeyValueArguments.cpp | 17 +- .../ASTFunctionWithKeyValueArguments.h | 2 + src/Parsers/Access/ASTCreateUserQuery.cpp | 20 +- src/Parsers/Access/ASTCreateUserQuery.h | 2 +- src/Parsers/ExpressionListParsers.cpp | 8 +- src/Parsers/IAST.cpp | 25 +- src/Parsers/IAST.h | 29 +- src/Parsers/ParserBackupQuery.cpp | 16 +- src/Parsers/ParserCreateQuery.cpp | 28 +- src/Parsers/ParserCreateQuery.h | 11 + src/Parsers/findFunctionSecretArguments.cpp | 348 +++++++++++ src/Parsers/findFunctionSecretArguments.h | 16 + .../maskSensitiveInfoInQueryForLogging.cpp | 579 ------------------ .../maskSensitiveInfoInQueryForLogging.h | 19 - 31 files changed, 607 insertions(+), 678 deletions(-) create mode 100644 src/Parsers/findFunctionSecretArguments.cpp create mode 100644 src/Parsers/findFunctionSecretArguments.h delete mode 100644 src/Parsers/maskSensitiveInfoInQueryForLogging.cpp delete mode 100644 src/Parsers/maskSensitiveInfoInQueryForLogging.h diff --git a/programs/copier/TaskTable.cpp b/programs/copier/TaskTable.cpp index 5b09a9c99a7..65eaf8b7108 100644 --- a/programs/copier/TaskTable.cpp +++ b/programs/copier/TaskTable.cpp @@ -45,7 +45,7 @@ TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConfigurati engine_push_str = config.getString(table_prefix + "engine", "rand()"); { - ParserStorage parser_storage; + ParserStorage parser_storage{ParserStorage::TABLE_ENGINE}; engine_push_ast = parseQuery(parser_storage, engine_push_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); engine_push_partition_key_ast = extractPartitionKey(engine_push_ast); primary_key_comma_separated = 
boost::algorithm::join(extractPrimaryKeyColumnNames(engine_push_ast), ", "); diff --git a/src/Analyzer/FunctionNode.cpp b/src/Analyzer/FunctionNode.cpp index 55e097cc5f4..adc5782f7b7 100644 --- a/src/Analyzer/FunctionNode.cpp +++ b/src/Analyzer/FunctionNode.cpp @@ -166,7 +166,12 @@ ASTPtr FunctionNode::toASTImpl() const auto function_ast = std::make_shared(); function_ast->name = function_name; - function_ast->is_window_function = isWindowFunction(); + + if (isWindowFunction()) + { + function_ast->is_window_function = true; + function_ast->kind = ASTFunction::Kind::WINDOW_FUNCTION; + } const auto & parameters = getParameters(); if (!parameters.getNodes().empty()) diff --git a/src/Backups/BackupInfo.cpp b/src/Backups/BackupInfo.cpp index 009c262ad67..334bc5b5892 100644 --- a/src/Backups/BackupInfo.cpp +++ b/src/Backups/BackupInfo.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB @@ -36,6 +35,7 @@ ASTPtr BackupInfo::toAST() const auto func = std::make_shared(); func->name = backup_engine_name; func->no_empty_args = true; + func->kind = ASTFunction::Kind::BACKUP_NAME; auto list = std::make_shared(); func->arguments = list; @@ -95,8 +95,7 @@ BackupInfo BackupInfo::fromAST(const IAST & ast) String BackupInfo::toStringForLogging() const { - ASTPtr ast = toAST(); - return maskSensitiveInfoInBackupNameForLogging(serializeAST(*ast), ast); + return toAST()->formatForLogging(); } } diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 295ab723326..2179a572bcd 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -126,7 +126,12 @@ void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const query.settings = query_settings; - query.base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; + auto base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; + if (base_backup_name) + query.setOrReplace(query.base_backup_name, std::move(base_backup_name)); + else + query.reset(query.base_backup_name); + query.cluster_host_ids = !cluster_host_ids.empty() ? Util::clusterHostIDsToAST(cluster_host_ids) : nullptr; } diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 5be2dde9a3d..704562488b1 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index 63915670fa4..322954abc2b 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -213,7 +214,12 @@ void RestoreSettings::copySettingsToQuery(ASTBackupQuery & query) const query.settings = query_settings; - query.base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; + auto base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; + if (base_backup_name) + query.setOrReplace(query.base_backup_name, std::move(base_backup_name)); + else + query.reset(query.base_backup_name); + query.cluster_host_ids = !cluster_host_ids.empty() ? 
BackupSettings::Util::clusterHostIDsToAST(cluster_host_ids) : nullptr; } diff --git a/src/Common/SensitiveDataMasker.cpp b/src/Common/SensitiveDataMasker.cpp index 15ae6961513..0c2e71a71f3 100644 --- a/src/Common/SensitiveDataMasker.cpp +++ b/src/Common/SensitiveDataMasker.cpp @@ -13,12 +13,19 @@ #include #include +#include #ifndef NDEBUG # include #endif +namespace ProfileEvents +{ + extern const Event QueryMaskingRulesMatch; +} + + namespace DB { namespace ErrorCodes @@ -165,6 +172,10 @@ size_t SensitiveDataMasker::wipeSensitiveData(std::string & data) const size_t matches = 0; for (const auto & rule : all_masking_rules) matches += rule->apply(data); + + if (matches) + ProfileEvents::increment(ProfileEvents::QueryMaskingRulesMatch, matches); + return matches; } @@ -184,4 +195,18 @@ size_t SensitiveDataMasker::rulesCount() const return all_masking_rules.size(); } + +std::string wipeSensitiveDataAndCutToLength(const std::string & str, size_t max_length) +{ + std::string res = str; + + if (auto * masker = SensitiveDataMasker::getInstance()) + masker->wipeSensitiveData(res); + + if (max_length && (res.length() > max_length)) + res.resize(max_length); + + return res; +} + } diff --git a/src/Common/SensitiveDataMasker.h b/src/Common/SensitiveDataMasker.h index adb6f5d51e1..031956cb926 100644 --- a/src/Common/SensitiveDataMasker.h +++ b/src/Common/SensitiveDataMasker.h @@ -69,4 +69,8 @@ public: size_t rulesCount() const; }; +/// Wipes sensitive data and cuts to a specified maximum length in one function call. +/// If the maximum length is zero then the function doesn't cut to the maximum length. +std::string wipeSensitiveDataAndCutToLength(const std::string & str, size_t max_length = 0); + } diff --git a/src/Interpreters/Access/InterpreterShowAccessQuery.cpp b/src/Interpreters/Access/InterpreterShowAccessQuery.cpp index d1d8ee63b8e..cafe5df104e 100644 --- a/src/Interpreters/Access/InterpreterShowAccessQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowAccessQuery.cpp @@ -32,13 +32,8 @@ QueryPipeline InterpreterShowAccessQuery::executeImpl() const /// Build the result column. MutableColumnPtr column = ColumnString::create(); - WriteBufferFromOwnString buf; for (const auto & query : queries) - { - buf.restart(); - formatAST(*query, buf, false, true); - column->insert(buf.str()); - } + column->insert(query->formatWithHiddenSecrets()); String desc = "ACCESS"; return QueryPipeline(std::make_shared(Block{{std::move(column), std::make_shared(), desc}})); diff --git a/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp b/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp index a6c4df3a86d..3a36008aa0a 100644 --- a/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp +++ b/src/Interpreters/Access/InterpreterShowCreateAccessEntityQuery.cpp @@ -62,10 +62,7 @@ namespace } if (user.auth_data.getType() != AuthenticationType::NO_PASSWORD) - { query->auth_data = user.auth_data; - query->show_password = attach_mode; /// We don't show password unless it's an ATTACH statement. - } if (!user.settings.empty()) { @@ -256,19 +253,12 @@ QueryPipeline InterpreterShowCreateAccessEntityQuery::executeImpl() /// Build the result column. 
MutableColumnPtr column = ColumnString::create(); - WriteBufferFromOwnString create_query_buf; for (const auto & create_query : create_queries) - { - formatAST(*create_query, create_query_buf, false, true); - column->insert(create_query_buf.str()); - create_query_buf.restart(); - } + column->insert(create_query->formatWithHiddenSecrets()); /// Prepare description of the result column. - WriteBufferFromOwnString desc_buf; const auto & show_query = query_ptr->as(); - formatAST(show_query, desc_buf, false, true); - String desc = desc_buf.str(); + String desc = serializeAST(show_query); String prefix = "SHOW "; if (startsWith(desc, prefix)) desc = desc.substr(prefix.length()); /// `desc` always starts with "SHOW ", so we can trim this prefix. diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 331cc5092c5..1c551dc89e0 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -15,7 +15,6 @@ #include #include #include -#include namespace DB @@ -170,11 +169,11 @@ void DDLTaskBase::parseQueryFromEntry(ContextPtr context) query = parseQuery(parser_query, begin, end, description, 0, settings.max_parser_depth); } -void DDLTaskBase::formatRewrittenQuery(ContextPtr) +void DDLTaskBase::formatRewrittenQuery(ContextPtr context) { /// Convert rewritten AST back to string. query_str = queryToString(*query); - query_for_logging = maskSensitiveInfoInQueryForLogging(query_str, query); + query_for_logging = query->formatForLogging(context->getSettingsRef().log_queries_cut_to_length); } ContextMutablePtr DDLTaskBase::makeQueryContext(ContextPtr from_context, const ZooKeeperPtr & /*zookeeper*/) diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index b5e4a0e81c7..2cd177edd40 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -75,7 +75,7 @@ namespace const char * getName() const override { return "storage definition with comment"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override { - ParserStorage storage_p; + ParserStorage storage_p{ParserStorage::TABLE_ENGINE}; ASTPtr storage; if (!storage_p.parse(pos, storage, expected)) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index e1966597e7d..b44db316f90 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -55,7 +56,6 @@ #include #include #include -#include #include #include @@ -352,6 +352,7 @@ static std::tuple executeQueryImpl( ASTPtr ast; String query; String query_for_logging; + size_t log_queries_cut_to_length = context->getSettingsRef().log_queries_cut_to_length; /// Parse the query from string. try @@ -392,15 +393,23 @@ static std::tuple executeQueryImpl( /// MUST go before any modification (except for prepared statements, /// since it substitute parameters and without them query does not contain /// parameters), to keep query as-is in query_log and server log. - query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast); + if (ast->hasSecretParts()) + { + /// IAST::formatForLogging() wipes secret parts in AST and then calls wipeSensitiveDataAndCutToLength(). + query_for_logging = ast->formatForLogging(log_queries_cut_to_length); + } + else + { + query_for_logging = wipeSensitiveDataAndCutToLength(query, log_queries_cut_to_length); + } } catch (...) { /// Anyway log the query. 
if (query.empty()) query.assign(begin, std::min(end - begin, static_cast(max_query_size))); - query_for_logging = maskSensitiveInfoInQueryForLogging(query, ast); + query_for_logging = wipeSensitiveDataAndCutToLength(query, log_queries_cut_to_length); logQuery(query_for_logging, context, internal, stage); if (!internal) diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index 567b52b5669..85a0a0c7799 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -141,7 +142,7 @@ namespace } } - void formatSettings(const ASTPtr & settings, const ASTPtr & base_backup_name, const ASTPtr & cluster_host_ids, const IAST::FormatSettings & format) + void formatSettings(const ASTPtr & settings, const ASTFunction * base_backup_name, const ASTPtr & cluster_host_ids, const IAST::FormatSettings & format) { if (!settings && !base_backup_name && !cluster_host_ids) return; @@ -246,12 +247,13 @@ String ASTBackupQuery::getID(char) const ASTPtr ASTBackupQuery::clone() const { auto res = std::make_shared(*this); + res->children.clear(); if (backup_name) - res->backup_name = backup_name->clone(); + res->set(res->backup_name, backup_name->clone()); if (base_backup_name) - res->base_backup_name = base_backup_name->clone(); + res->set(res->base_backup_name, base_backup_name->clone()); if (cluster_host_ids) res->cluster_host_ids = cluster_host_ids->clone(); diff --git a/src/Parsers/ASTBackupQuery.h b/src/Parsers/ASTBackupQuery.h index dd094b77aae..708f5bf468f 100644 --- a/src/Parsers/ASTBackupQuery.h +++ b/src/Parsers/ASTBackupQuery.h @@ -8,6 +8,7 @@ namespace DB { using Strings = std::vector; using DatabaseAndTableName = std::pair; +class ASTFunction; /** BACKUP { TABLE [db.]table_name [AS [db.]table_name_in_backup] [PARTITION[S] partition_expr [,...]] | @@ -77,13 +78,13 @@ public: Elements elements; - ASTPtr backup_name; + ASTFunction * backup_name = nullptr; ASTPtr settings; /// Base backup. Only differences made after the base backup will be included in a newly created backup, /// so this setting allows to make an incremental backup. - ASTPtr base_backup_name; + ASTFunction * base_backup_name = nullptr; /// List of cluster's hosts' IDs if this is a BACKUP/RESTORE ON CLUSTER command. 
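Switching backup_name and base_backup_name from owning ASTPtr members to typed raw pointers is why ASTBackupQuery::clone() above now clears the copied children and re-attaches clones through set(): otherwise the copy's pointer would keep aliasing a child owned by the original node. A minimal standalone sketch of that ownership pattern (Node, set and backup_name here are illustrative stand-ins for IAST::set and the real members):

#include <memory>
#include <vector>

// Minimal sketch of the set()/clone() pattern: the node owns its children in a
// vector, while typed raw pointers (like ASTBackupQuery::backup_name) alias them.
struct Node
{
    std::vector<std::shared_ptr<Node>> children;
    Node * backup_name = nullptr;  // non-owning alias into `children`

    void set(Node *& field, std::shared_ptr<Node> child)
    {
        field = child.get();
        children.push_back(std::move(child));
    }

    std::shared_ptr<Node> clone() const
    {
        auto res = std::make_shared<Node>(*this);
        res->children.clear();  // drop aliases copied from *this
        if (backup_name)
            res->set(res->backup_name, backup_name->clone());
        return res;
    }
};

int main()
{
    Node query;
    query.set(query.backup_name, std::make_shared<Node>());
    auto copy = query.clone();  // copy->backup_name now points into copy->children
    (void)copy;
}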
ASTPtr cluster_host_ids; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index 63dc9f6b3ac..3467b8b5ebc 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -629,6 +630,10 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format && (name == "match" || name == "extract" || name == "extractAll" || name == "replaceRegexpOne" || name == "replaceRegexpAll"); + auto secret_arguments = std::make_pair(static_cast(-1), static_cast(-1)); + if (!settings.show_secrets) + secret_arguments = findFunctionSecretArguments(*this); + for (size_t i = 0, size = arguments->children.size(); i < size; ++i) { if (i != 0) @@ -636,12 +641,21 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (arguments->children[i]->as()) settings.ostr << "SETTINGS "; - bool special_hilite = false; - if (i == 1 && special_hilite_regexp) - special_hilite = highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-"); + if ((i == 1) && special_hilite_regexp + && highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-")) + { + continue; + } - if (!special_hilite) - arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens); + if (!settings.show_secrets && (secret_arguments.first <= i) && (i < secret_arguments.second)) + { + settings.ostr << "'[HIDDEN]'"; + if (size - 1 < secret_arguments.second) + break; /// All other arguments should also be hidden. + continue; + } + + arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens); } } @@ -653,6 +667,18 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format return finishFormatWithWindow(settings, state, frame); } +bool ASTFunction::hasSecretParts() const +{ + if (arguments) + { + size_t num_arguments = arguments->children.size(); + auto secret_arguments = findFunctionSecretArguments(*this); + if ((secret_arguments.first < num_arguments) && (secret_arguments.first < secret_arguments.second)) + return true; + } + return childrenHaveSecretParts(); +} + String getFunctionName(const IAST * ast) { String res; diff --git a/src/Parsers/ASTFunction.h b/src/Parsers/ASTFunction.h index 5756fb9ba86..052325fcf61 100644 --- a/src/Parsers/ASTFunction.h +++ b/src/Parsers/ASTFunction.h @@ -42,6 +42,18 @@ public: /// do not print empty parentheses if there are no args - compatibility with new AST for data types and engine names. bool no_empty_args = false; + /// Kind of this function-like expression, how it's actually used. + enum class Kind + { + ORDINARY_FUNCTION, + WINDOW_FUNCTION, + LAMBDA_FUNCTION, + TABLE_ENGINE, + DATABASE_ENGINE, + BACKUP_NAME, + }; + Kind kind = Kind::ORDINARY_FUNCTION; + /** Get text identifying the AST node. 
*/ String getID(char delim) const override; @@ -55,6 +67,8 @@ public: std::string getWindowDescription() const; + bool hasSecretParts() const override; + protected: void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; void appendColumnNameImpl(WriteBuffer & ostr) const override; diff --git a/src/Parsers/ASTFunctionWithKeyValueArguments.cpp b/src/Parsers/ASTFunctionWithKeyValueArguments.cpp index d94490ab8b3..2c28e342610 100644 --- a/src/Parsers/ASTFunctionWithKeyValueArguments.cpp +++ b/src/Parsers/ASTFunctionWithKeyValueArguments.cpp @@ -29,7 +29,16 @@ void ASTPair::formatImpl(const FormatSettings & settings, FormatState & state, F if (second_with_brackets) settings.ostr << (settings.hilite ? hilite_keyword : "") << "("; - second->formatImpl(settings, state, frame); + if (!settings.show_secrets && (first == "password")) + { + /// Hide password in the definition of a dictionary: + /// SOURCE(CLICKHOUSE(host 'example01-01-1' port 9000 user 'default' password '[HIDDEN]' db 'default' table 'ids')) + settings.ostr << "'[HIDDEN]'"; + } + else + { + second->formatImpl(settings, state, frame); + } if (second_with_brackets) settings.ostr << (settings.hilite ? hilite_keyword : "") << ")"; @@ -38,6 +47,12 @@ void ASTPair::formatImpl(const FormatSettings & settings, FormatState & state, F } +bool ASTPair::hasSecretParts() const +{ + return first == "password"; +} + + void ASTPair::updateTreeHashImpl(SipHash & hash_state) const { hash_state.update(first.size()); diff --git a/src/Parsers/ASTFunctionWithKeyValueArguments.h b/src/Parsers/ASTFunctionWithKeyValueArguments.h index 4b745e2c1a2..e54bc9288f2 100644 --- a/src/Parsers/ASTFunctionWithKeyValueArguments.h +++ b/src/Parsers/ASTFunctionWithKeyValueArguments.h @@ -29,6 +29,8 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; + + bool hasSecretParts() const override; void updateTreeHashImpl(SipHash & hash_state) const override; }; diff --git a/src/Parsers/Access/ASTCreateUserQuery.cpp b/src/Parsers/Access/ASTCreateUserQuery.cpp index b4eaf08856e..a59b5dd472c 100644 --- a/src/Parsers/Access/ASTCreateUserQuery.cpp +++ b/src/Parsers/Access/ASTCreateUserQuery.cpp @@ -23,7 +23,7 @@ namespace } - void formatAuthenticationData(const AuthenticationData & auth_data, bool show_password, const IAST::FormatSettings & settings) + void formatAuthenticationData(const AuthenticationData & auth_data, const IAST::FormatSettings & settings) { auto auth_type = auth_data.getType(); if (auth_type == AuthenticationType::NO_PASSWORD) @@ -93,7 +93,7 @@ namespace throw Exception("AST: Unexpected authentication type " + toString(auth_type), ErrorCodes::LOGICAL_ERROR); } - if (password && !show_password) + if (password && !settings.show_secrets) { prefix = ""; password.reset(); @@ -324,7 +324,7 @@ void ASTCreateUserQuery::formatImpl(const FormatSettings & format, FormatState & formatRenameTo(*new_name, format); if (auth_data) - formatAuthenticationData(*auth_data, show_password, format); + formatAuthenticationData(*auth_data, format); if (hosts) formatHosts(nullptr, *hosts, format); @@ -345,4 +345,18 @@ void ASTCreateUserQuery::formatImpl(const FormatSettings & format, FormatState & if (grantees) formatGrantees(*grantees, format); } + +bool ASTCreateUserQuery::hasSecretParts() const +{ + if (auth_data) + { + auto auth_type = auth_data->getType(); + if ((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) + 
|| (auth_type == AuthenticationType::SHA256_PASSWORD) + || (auth_type == AuthenticationType::DOUBLE_SHA1_PASSWORD)) + return true; + } + return childrenHaveSecretParts(); +} + } diff --git a/src/Parsers/Access/ASTCreateUserQuery.h b/src/Parsers/Access/ASTCreateUserQuery.h index 32d53b1bba7..8357b9f2b6c 100644 --- a/src/Parsers/Access/ASTCreateUserQuery.h +++ b/src/Parsers/Access/ASTCreateUserQuery.h @@ -45,7 +45,6 @@ public: std::optional new_name; std::optional auth_data; - bool show_password = true; /// formatImpl() shows a password or hash by default std::optional hosts; std::optional add_hosts; @@ -60,6 +59,7 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; + bool hasSecretParts() const override; ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index d29aa248ec4..2d7c4892f6c 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -494,7 +494,12 @@ template static std::shared_ptr makeASTFunction(Operator & op, Args &&... args) { auto ast_function = makeASTFunction(op.function_name, std::forward(args)...); - ast_function->is_lambda_function = op.type == OperatorType::Lambda; + + if (op.type == OperatorType::Lambda) + { + ast_function->is_lambda_function = true; + ast_function->kind = ASTFunction::Kind::LAMBDA_FUNCTION; + } return ast_function; } @@ -999,6 +1004,7 @@ public: if (over.ignore(pos, expected)) { function_node->is_window_function = true; + function_node->kind = ASTFunction::Kind::WINDOW_FUNCTION; ASTPtr function_node_as_iast = function_node; diff --git a/src/Parsers/IAST.cpp b/src/Parsers/IAST.cpp index 65740f290b5..7b1f8fe51f6 100644 --- a/src/Parsers/IAST.cpp +++ b/src/Parsers/IAST.cpp @@ -1,8 +1,10 @@ +#include + #include #include #include +#include #include -#include namespace DB @@ -165,11 +167,26 @@ size_t IAST::checkDepthImpl(size_t max_depth) const return res; } -std::string IAST::formatForErrorMessage() const +String IAST::formatWithHiddenSecrets(size_t max_length, bool one_line, bool hilite) const { WriteBufferFromOwnString buf; - format(FormatSettings(buf, true /* one line */)); - return buf.str(); + + FormatSettings settings{buf, one_line}; + settings.hilite = hilite; + settings.show_secrets = false; + format(settings); + + return wipeSensitiveDataAndCutToLength(buf.str(), max_length); +} + +bool IAST::childrenHaveSecretParts() const +{ + for (const auto & child : children) + { + if (child->hasSecretParts()) + return true; + } + return false; } void IAST::cloneChildren() diff --git a/src/Parsers/IAST.h b/src/Parsers/IAST.h index bba878e4769..68e086b16b6 100644 --- a/src/Parsers/IAST.h +++ b/src/Parsers/IAST.h @@ -185,6 +185,7 @@ public: bool one_line; bool always_quote_identifiers = false; IdentifierQuotingStyle identifier_quoting_style = IdentifierQuotingStyle::Backticks; + bool show_secrets = true; /// Show secret parts of the AST (e.g. passwords, encryption keys). // Newline or whitespace. char nl_or_ws; @@ -240,9 +241,12 @@ public: } // A simple way to add some user-readable context to an error message. 
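The new FormatSettings::show_secrets flag is what formatWithHiddenSecrets() switches off before formatting, so any node that knows it holds a secret can print the '[HIDDEN]' placeholder instead of the real value. A standalone sketch of that pattern (Settings and PasswordArg are illustrative stand-ins, not the real IAST classes):

#include <iostream>
#include <sstream>
#include <string>

// Illustrative sketch of the FormatSettings::show_secrets flag: the same
// format() call either prints the real value or the '[HIDDEN]' placeholder.
struct Settings
{
    std::ostream & ostr;
    bool show_secrets = true;
};

struct PasswordArg
{
    std::string value;

    void format(const Settings & settings) const
    {
        if (settings.show_secrets)
            settings.ostr << '\'' << value << '\'';
        else
            settings.ostr << "'[HIDDEN]'";
    }
};

int main()
{
    PasswordArg password{"qwe123"};
    std::ostringstream for_user, for_log;
    password.format({for_user, /*show_secrets=*/ true});
    password.format({for_log, /*show_secrets=*/ false});
    std::cout << for_user.str() << '\n' << for_log.str() << '\n';
}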
- std::string formatForErrorMessage() const; - template - static std::string formatForErrorMessage(const AstArray & array); + String formatWithHiddenSecrets(size_t max_length = 0, bool one_line = true, bool hilite = false) const; + String formatForLogging(size_t max_length = 0) const { return formatWithHiddenSecrets(max_length, true, false); } + String formatForErrorMessage() const { return formatWithHiddenSecrets(0, true, false); } + + /// If an AST has secret parts then formatForLogging() will replace them with the placeholder '[HIDDEN]'. + virtual bool hasSecretParts() const { return childrenHaveSecretParts(); } void cloneChildren(); @@ -272,6 +276,9 @@ public: static const char * hilite_substitution; static const char * hilite_none; +protected: + bool childrenHaveSecretParts() const; + private: size_t checkDepthImpl(size_t max_depth) const; @@ -282,20 +289,4 @@ private: ASTPtr * next_to_delete_list_head = nullptr; }; -template -std::string IAST::formatForErrorMessage(const AstArray & array) -{ - WriteBufferFromOwnString buf; - for (size_t i = 0; i < array.size(); ++i) - { - if (i > 0) - { - const char * delim = ", "; - buf.write(delim, strlen(delim)); - } - array[i]->format(IAST::FormatSettings(buf, true /* one line */)); - } - return buf.str(); -} - } diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index 649304b1dab..4a8eb2b5802 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -207,7 +208,11 @@ namespace bool parseBackupName(IParser::Pos & pos, Expected & expected, ASTPtr & backup_name) { - return ParserIdentifierWithOptionalParameters{}.parse(pos, backup_name, expected); + if (!ParserIdentifierWithOptionalParameters{}.parse(pos, backup_name, expected)) + return false; + + backup_name->as().kind = ASTFunction::Kind::BACKUP_NAME; + return true; } bool parseBaseBackupSetting(IParser::Pos & pos, Expected & expected, ASTPtr & base_backup_name) @@ -358,11 +363,16 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) query->kind = kind; query->elements = std::move(elements); query->cluster = std::move(cluster); - query->backup_name = std::move(backup_name); + + if (backup_name) + query->set(query->backup_name, std::move(backup_name)); + query->settings = std::move(settings); - query->base_backup_name = std::move(base_backup_name); query->cluster_host_ids = std::move(cluster_host_ids); + if (base_backup_name) + query->set(query->base_backup_name, std::move(base_backup_name)); + return true; } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index bf305ba4781..9c1c682ca03 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -442,6 +442,20 @@ bool ParserStorage::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!storage_like) return false; + if (engine) + { + switch (engine_kind) + { + case EngineKind::TABLE_ENGINE: + engine->as().kind = ASTFunction::Kind::TABLE_ENGINE; + break; + + case EngineKind::DATABASE_ENGINE: + engine->as().kind = ASTFunction::Kind::DATABASE_ENGINE; + break; + } + } + auto storage = std::make_shared(); storage->set(storage->engine, engine); storage->set(storage->partition_by, partition_by); @@ -449,7 +463,6 @@ bool ParserStorage::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) storage->set(storage->order_by, order_by); storage->set(storage->sample_by, sample_by); storage->set(storage->ttl_table, ttl_table); 
- storage->set(storage->settings, settings); node = storage; @@ -473,7 +486,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe ParserToken s_comma(TokenType::Comma); ParserToken s_lparen(TokenType::OpeningRoundBracket); ParserToken s_rparen(TokenType::ClosingRoundBracket); - ParserStorage storage_p; + ParserStorage storage_p{ParserStorage::TABLE_ENGINE}; ParserIdentifier name_p; ParserTablePropertiesDeclarationList table_properties_p; ParserSelectWithUnionQuery select_p; @@ -635,6 +648,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe } } } + auto comment = parseComment(pos, expected); auto query = std::make_shared(); @@ -697,6 +711,8 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e ParserToken s_dot(TokenType::Dot); ParserToken s_lparen(TokenType::OpeningRoundBracket); ParserToken s_rparen(TokenType::ClosingRoundBracket); + ParserStorage storage_p{ParserStorage::TABLE_ENGINE}; + ParserStorage storage_inner{ParserStorage::TABLE_ENGINE}; ParserTablePropertiesDeclarationList table_properties_p; ParserSelectWithUnionQuery select_p; @@ -835,8 +851,8 @@ bool ParserCreateWindowViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & ParserToken s_eq(TokenType::Equals); ParserToken s_lparen(TokenType::OpeningRoundBracket); ParserToken s_rparen(TokenType::ClosingRoundBracket); - ParserStorage storage_p; - ParserStorage storage_inner; + ParserStorage storage_p{ParserStorage::TABLE_ENGINE}; + ParserStorage storage_inner{ParserStorage::TABLE_ENGINE}; ParserTablePropertiesDeclarationList table_properties_p; ParserExpression watermark_p; ParserExpression lateness_p; @@ -1135,7 +1151,7 @@ bool ParserCreateDatabaseQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e ParserKeyword s_attach("ATTACH"); ParserKeyword s_database("DATABASE"); ParserKeyword s_if_not_exists("IF NOT EXISTS"); - ParserStorage storage_p; + ParserStorage storage_p{ParserStorage::DATABASE_ENGINE}; ParserIdentifier name_p(true); ParserTableOverridesDeclarationList table_overrides_p; @@ -1222,7 +1238,7 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ParserToken s_dot(TokenType::Dot); ParserToken s_lparen(TokenType::OpeningRoundBracket); ParserToken s_rparen(TokenType::ClosingRoundBracket); - ParserStorage storage_p; + ParserStorage storage_p{ParserStorage::TABLE_ENGINE}; ParserIdentifier name_p; ParserTablePropertiesDeclarationList table_properties_p; ParserSelectWithUnionQuery select_p; diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index b5480fa6eae..e1573c92dab 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -420,9 +420,20 @@ protected: */ class ParserStorage : public IParserBase { +public: + /// What kind of engine we're going to parse. 
+ enum EngineKind + { + TABLE_ENGINE, + DATABASE_ENGINE, + }; + + ParserStorage(EngineKind engine_kind_) : engine_kind(engine_kind_) {} + protected: const char * getName() const override { return "storage definition"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; + EngineKind engine_kind; }; /** Query like this: diff --git a/src/Parsers/findFunctionSecretArguments.cpp b/src/Parsers/findFunctionSecretArguments.cpp new file mode 100644 index 00000000000..65c06b7ec97 --- /dev/null +++ b/src/Parsers/findFunctionSecretArguments.cpp @@ -0,0 +1,348 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + constexpr const std::pair npos{static_cast(-1), static_cast(-1)}; + + bool tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * res, bool allow_literal, bool allow_identifier) + { + if (!function.arguments) + return false; + + const auto * expr_list = function.arguments->as(); + if (!expr_list) + return false; /// return false because we don't want to validate query here + + const auto & arguments = expr_list->children; + if (arg_idx >= arguments.size()) + return false; + + ASTPtr argument = arguments[arg_idx]; + if (allow_literal) + { + if (const auto * literal = argument->as()) + { + if (literal->value.getType() != Field::Types::String) + return false; + if (res) + *res = literal->value.safeGet(); + return true; + } + } + + if (allow_identifier) + { + if (const auto * id = argument->as()) + { + if (res) + *res = id->name(); + return true; + } + } + + return false; + } + + + std::pair findS3FunctionSecretArguments(const ASTFunction & function, bool is_cluster_function) + { + const auto * expr_list = function.arguments->as(); + if (!expr_list) + return npos; /// return because we don't want to validate query here + + const auto & arguments = expr_list->children; + + /// s3Cluster('cluster_name', 'url', ...) has 'url' as its second argument. + size_t url_arg_idx = is_cluster_function ? 1 : 0; + + /// We're going to replace 'aws_secret_access_key' with '[HIDDEN'] for the following signatures: + /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) + /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') + + /// But we should check the number of arguments first because we don't need to do any replacements in case of + /// s3('url' [, 'format']) or s3Cluster('cluster_name', 'url' [, 'format']) + if (arguments.size() < url_arg_idx + 3) + return npos; + + if (arguments.size() >= url_arg_idx + 5) + { + /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'structure', ...) + return {url_arg_idx + 2, url_arg_idx + 3}; + } + else + { + /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) + /// We need to distinguish that from s3('url', 'format', 'structure' [, 'compression_method']). + /// So we will check whether the argument after 'url' is a format. + String format; + if (!tryGetStringFromArgument(function, url_arg_idx + 1, &format, /* allow_literal= */ true, /* allow_identifier= */ false)) + { + /// We couldn't evaluate the argument after 'url' so we don't know whether it is a format or `aws_access_key_id`. + /// So it's safer to wipe the next argument just in case. + return {url_arg_idx + 2, url_arg_idx + 3}; /// Wipe either `aws_secret_access_key` or `structure`. + } + + if (KnownFormatNames::instance().exists(format)) + return npos; /// The argument after 'url' is a format: s3('url', 'format', ...) 
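That KnownFormatNames check is the whole trick for the ambiguous arity: it is what lets findS3FunctionSecretArguments() tell s3('url', 'format', 'structure') apart from s3('url', 'aws_access_key_id', 'aws_secret_access_key'). A standalone sketch of the same decision over a plain list of string arguments (the helper name and the hard-coded format list are illustrative only):

#include <set>
#include <string>
#include <utility>
#include <vector>

using ArgRange = std::pair<size_t, size_t>;
static const ArgRange npos{static_cast<size_t>(-1), static_cast<size_t>(-1)};

// Sketch of the s3()/s3Cluster() heuristic over a plain list of string arguments.
ArgRange findS3SecretArgs(const std::vector<std::string> & args, bool is_cluster_function)
{
    static const std::set<std::string> known_formats{"CSV", "TSV", "JSONEachRow", "Parquet"};

    const size_t url_arg_idx = is_cluster_function ? 1 : 0;

    if (args.size() < url_arg_idx + 3)
        return npos;  // s3('url' [, 'format']): nothing to hide

    if (args.size() >= url_arg_idx + 5)
        return {url_arg_idx + 2, url_arg_idx + 3};  // ('url', key_id, secret, 'format', 'structure', ...)

    // Ambiguous arity: decide by whether the argument after 'url' looks like a format name.
    if (known_formats.count(args[url_arg_idx + 1]))
        return npos;  // ('url', 'format', 'structure' [, 'compression'])

    return {url_arg_idx + 2, url_arg_idx + 3};  // hide the secret access key
}

int main()
{
    auto range = findS3SecretArgs({"https://bucket/file", "KEYID", "SECRET", "CSV"}, false);
    return range == npos ? 1 : 0;
}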
+ + /// The argument after 'url' is not a format so we do our replacement: + /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) -> s3('url', 'aws_access_key_id', '[HIDDEN]', ...) + return {url_arg_idx + 2, url_arg_idx + 3}; + } + } + + + std::pair findRemoteFunctionSecretArguments(const ASTFunction & function) + { + const auto * expr_list = function.arguments->as(); + if (!expr_list) + return npos; /// return because we don't want to validate query here + + const auto & arguments = expr_list->children; + + /// We're going to replace 'password' with '[HIDDEN'] for the following signatures: + /// remote('addresses_expr', db.table, 'user' [, 'password'] [, sharding_key]) + /// remote('addresses_expr', 'db', 'table', 'user' [, 'password'] [, sharding_key]) + /// remote('addresses_expr', table_function(), 'user' [, 'password'] [, sharding_key]) + + /// But we should check the number of arguments first because we don't need to do any replacements in case of + /// remote('addresses_expr', db.table) + if (arguments.size() < 3) + return npos; + + size_t arg_num = 1; + + /// Skip 1 or 2 arguments with table_function() or db.table or 'db', 'table'. + const auto * table_function = arguments[arg_num]->as(); + if (table_function && KnownTableFunctionNames::instance().exists(table_function->name)) + { + ++arg_num; + } + else + { + String database; + if (!tryGetStringFromArgument(function, arg_num, &database, /* allow_literal= */ true, /* allow_identifier= */ true)) + { + /// We couldn't evaluate the argument so we don't know whether it is 'db.table' or just 'db'. + /// Hence we can't figure out whether we should skip one argument 'user' or two arguments 'table', 'user' + /// before the argument 'password'. So it's safer to wipe two arguments just in case. + /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string + /// before wiping it (because the `password` argument is always a literal string). + auto res = npos; + if (tryGetStringFromArgument(function, arg_num + 2, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + { + /// Wipe either `password` or `user`. + res = {arg_num + 2, arg_num + 3}; + } + if (tryGetStringFromArgument(function, arg_num + 3, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + { + /// Wipe either `password` or `sharding_key`. + if (res == npos) + res.first = arg_num + 3; + res.second = arg_num + 4; + } + return res; + } + + ++arg_num; + auto qualified_name = QualifiedTableName::parseFromString(database); + if (qualified_name.database.empty()) + ++arg_num; /// skip 'table' argument + } + + /// Skip username. + ++arg_num; + + /// Do our replacement: + /// remote('addresses_expr', db.table, 'user', 'password', ...) -> remote('addresses_expr', db.table, 'user', '[HIDDEN]', ...) + /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string + /// before wiping it (because the `password` argument is always a literal string). 
+ bool can_be_password = tryGetStringFromArgument(function, arg_num, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false); + if (can_be_password) + return {arg_num, arg_num + 1}; + + return npos; + } + + + std::pair findEncryptionFunctionSecretArguments(const ASTFunction & function) + { + const auto * expr_list = function.arguments->as(); + if (!expr_list) + return npos; /// return because we don't want to validate query here + + const auto & arguments = expr_list->children; + + /// We replace all arguments after 'mode' with '[HIDDEN]': + /// encrypt('mode', 'plaintext', 'key' [, iv, aad]) -> encrypt('mode', '[HIDDEN]') + return {1, arguments.size()}; + } + + + std::pair findOrdinaryFunctionSecretArguments(const ASTFunction & function) + { + if (function.name == "mysql") + { + /// mysql('host:port', 'database', 'table', 'user', 'password', ...) + return {4, 5}; + } + else if (function.name == "postgresql") + { + /// postgresql('host:port', 'database', 'table', 'user', 'password', ...) + return {4, 5}; + } + else if (function.name == "mongodb") + { + /// mongodb('host:port', 'database', 'collection', 'user', 'password', ...) + return {4, 5}; + } + else if (function.name == "s3" || function.name == "cosn" || function.name == "oss") + { + /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) + return findS3FunctionSecretArguments(function, /* is_cluster_function= */ false); + } + else if (function.name == "s3Cluster") + { + /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', ...) + return findS3FunctionSecretArguments(function, /* is_cluster_function= */ true); + } + else if (function.name == "remote" || function.name == "remoteSecure") + { + /// remote('addresses_expr', 'db', 'table', 'user', 'password', ...) + return findRemoteFunctionSecretArguments(function); + } + else if ( + function.name == "encrypt" || function.name == "decrypt" || function.name == "aes_encrypt_mysql" + || function.name == "aes_decrypt_mysql" || function.name == "tryDecrypt") + { + /// encrypt('mode', 'plaintext', 'key' [, iv, aad]) + return findEncryptionFunctionSecretArguments(function); + } + else + { + return npos; + } + } + + + std::pair findS3TableEngineSecretArguments(const ASTFunction & function) + { + const auto * expr_list = function.arguments->as(); + if (!expr_list) + return npos; /// return because we don't want to validate query here + + const auto & arguments = expr_list->children; + + /// We replace 'aws_secret_access_key' with '[HIDDEN'] for the following signatures: + /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format') + /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') + + /// But we should check the number of arguments first because we don't need to do that replacements in case of + /// S3('url' [, 'format' [, 'compression']]) + if (arguments.size() < 4) + return npos; + + return {2, 3}; + } + + + std::pair findTableEngineSecretArguments(const ASTFunction & function) + { + const String & engine_name = function.name; + if (engine_name == "ExternalDistributed") + { + /// ExternalDistributed('engine', 'host:port', 'database', 'table', 'user', 'password') + return {5, 6}; + } + else if (engine_name == "MySQL") + { + /// MySQL('host:port', 'database', 'table', 'user', 'password', ...) + return {4, 5}; + } + else if (engine_name == "PostgreSQL") + { + /// PostgreSQL('host:port', 'database', 'table', 'user', 'password', ...) 
+ return {4, 5}; + } + else if (engine_name == "MaterializedPostgreSQL") + { + /// MaterializedPostgreSQL('host:port', 'database', 'table', 'user', 'password', ...) + return {4, 5}; + } + else if (engine_name == "MongoDB") + { + /// MongoDB('host:port', 'database', 'collection', 'user', 'password', ...) + return {4, 5}; + } + else if (engine_name == "S3" || engine_name == "COSN" || engine_name == "OSS") + { + /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...) + return findS3TableEngineSecretArguments(function); + } + else + { + return npos; + } + } + + + std::pair findDatabaseEngineSecretArguments(const ASTFunction & function) + { + const String & engine_name = function.name; + if (engine_name == "MySQL" || engine_name == "MaterializeMySQL" || engine_name == "MaterializedMySQL") + { + /// MySQL('host:port', 'database', 'user', 'password') + return {3, 4}; + } + else if (engine_name == "PostgreSQL" || engine_name == "MaterializedPostgreSQL") + { + /// PostgreSQL('host:port', 'database', 'user', 'password', ...) + return {3, 4}; + } + else + { + return npos; + } + } + + + std::pair findBackupNameSecretArguments(const ASTFunction & function) + { + const String & engine_name = function.name; + if (engine_name == "S3") + { + /// BACKUP ... TO S3(url, [aws_access_key_id, aws_secret_access_key]) + return {2, 3}; + } + else + { + return npos; + } + } +} + +std::pair findFunctionSecretArguments(const ASTFunction & function) +{ + switch (function.kind) + { + case ASTFunction::Kind::ORDINARY_FUNCTION: return findOrdinaryFunctionSecretArguments(function); + case ASTFunction::Kind::WINDOW_FUNCTION: return npos; + case ASTFunction::Kind::LAMBDA_FUNCTION: return npos; + case ASTFunction::Kind::TABLE_ENGINE: return findTableEngineSecretArguments(function); + case ASTFunction::Kind::DATABASE_ENGINE: return findDatabaseEngineSecretArguments(function); + case ASTFunction::Kind::BACKUP_NAME: return findBackupNameSecretArguments(function); + } +} + +} diff --git a/src/Parsers/findFunctionSecretArguments.h b/src/Parsers/findFunctionSecretArguments.h new file mode 100644 index 00000000000..30c00c9466c --- /dev/null +++ b/src/Parsers/findFunctionSecretArguments.h @@ -0,0 +1,16 @@ +#pragma once + +#include + + +namespace DB +{ +class ASTFunction; + +/// Finds arguments of a specified function which should not be displayed for most users for security reasons. +/// That involves passwords and secret keys. +/// The function returns a pair of numbers [first, last) specifying arguments which must be hidden. +/// If the function returns {-1, -1} that means no arguments must be hidden. 
+std::pair findFunctionSecretArguments(const ASTFunction & function); + +} diff --git a/src/Parsers/maskSensitiveInfoInQueryForLogging.cpp b/src/Parsers/maskSensitiveInfoInQueryForLogging.cpp deleted file mode 100644 index ebadfa5425b..00000000000 --- a/src/Parsers/maskSensitiveInfoInQueryForLogging.cpp +++ /dev/null @@ -1,579 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace ProfileEvents -{ - extern const Event QueryMaskingRulesMatch; -} - - -namespace DB -{ - -namespace -{ - enum class PasswordWipingMode - { - Query, - BackupName, - }; - - - template - class PasswordWipingVisitor - { - public: - struct Data - { - bool can_contain_password = false; - bool password_was_hidden = false; - bool is_create_table_query = false; - bool is_create_database_query = false; - bool is_create_dictionary_query = false; - PasswordWipingMode mode = PasswordWipingMode::Query; - }; - - using Visitor = std::conditional_t< - check_only, - ConstInDepthNodeVisitor, - InDepthNodeVisitor>; - - static bool needChildVisit(const ASTPtr & /* ast */, const ASTPtr & /* child */, Data & data) - { - if constexpr (check_only) - { - return !data.can_contain_password; - } - else - { - return true; - } - } - - static void visit(ASTPtr ast, Data & data) - { - if (auto * create_user_query = ast->as()) - { - visitCreateUserQuery(*create_user_query, data); - } - else if (auto * create_query = ast->as()) - { - visitCreateQuery(*create_query, data); - } - else if (auto * backup_query = ast->as()) - { - visitBackupQuery(*backup_query, data); - } - else if (auto * storage = ast->as()) - { - if (data.is_create_table_query) - visitTableEngine(*storage, data); - else if (data.is_create_database_query) - visitDatabaseEngine(*storage, data); - } - else if (auto * dictionary = ast->as()) - { - if (data.is_create_dictionary_query) - visitDictionaryDef(*dictionary, data); - } - else if (auto * function = ast->as()) - { - if (data.mode == PasswordWipingMode::BackupName) - wipePasswordFromBackupEngineArguments(*function, data); - else - visitFunction(*function, data); - } - } - - private: - static void visitCreateUserQuery(ASTCreateUserQuery & query, Data & data) - { - if (!query.auth_data) - return; - - auto auth_type = query.auth_data->getType(); - if (auth_type == AuthenticationType::NO_PASSWORD || auth_type == AuthenticationType::LDAP - || auth_type == AuthenticationType::KERBEROS || auth_type == AuthenticationType::SSL_CERTIFICATE) - return; /// No password, nothing to hide. - - if constexpr (check_only) - { - data.can_contain_password = true; - return; - } - - query.show_password = false; - data.password_was_hidden = true; - } - - static void visitCreateQuery(ASTCreateQuery & query, Data & data) - { - if (query.is_dictionary) - data.is_create_dictionary_query = true; - else if (query.table) - data.is_create_table_query = true; - else if (query.database) - data.is_create_database_query = true; - } - - static void visitTableEngine(ASTStorage & storage, Data & data) - { - if (!storage.engine) - return; - - const String & engine_name = storage.engine->name; - - if (engine_name == "ExternalDistributed") - { - /// ExternalDistributed('engine', 'host:port', 'database', 'table', 'user', 'password') - wipePasswordFromArgument(*storage.engine, data, 5); - } - else if (engine_name == "MySQL") - { - /// MySQL('host:port', 'database', 'table', 'user', 'password', ...) 
- wipePasswordFromArgument(*storage.engine, data, 4); - } - else if (engine_name == "PostgreSQL") - { - /// PostgreSQL('host:port', 'database', 'table', 'user', 'password', ...) - wipePasswordFromArgument(*storage.engine, data, 4); - } - else if (engine_name == "MaterializedPostgreSQL") - { - /// MaterializedPostgreSQL('host:port', 'database', 'table', 'user', 'password', ...) - wipePasswordFromArgument(*storage.engine, data, 4); - } - else if (engine_name == "MongoDB") - { - /// MongoDB('host:port', 'database', 'collection', 'user', 'password', ...) - wipePasswordFromArgument(*storage.engine, data, 4); - } - else if (engine_name == "S3" || engine_name == "COSN" || engine_name == "OSS") - { - /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...) - wipePasswordFromS3TableEngineArguments(*storage.engine, data); - } - } - - static void wipePasswordFromS3TableEngineArguments(ASTFunction & engine, Data & data) - { - /// We replace 'aws_secret_access_key' with '[HIDDEN'] for the following signatures: - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') - - /// But we should check the number of arguments first because we don't need to do that replacements in case of - /// S3('url' [, 'format' [, 'compression']]) - size_t num_arguments; - if (!tryGetNumArguments(engine, &num_arguments) || (num_arguments < 4)) - return; - - wipePasswordFromArgument(engine, data, 2); - } - - static void visitDatabaseEngine(ASTStorage & storage, Data & data) - { - if (!storage.engine) - return; - - const String & engine_name = storage.engine->name; - - if (engine_name == "MySQL" || engine_name == "MaterializeMySQL" || engine_name == "MaterializedMySQL") - { - /// MySQL('host:port', 'database', 'user', 'password') - wipePasswordFromArgument(*storage.engine, data, 3); - } - else if (engine_name == "PostgreSQL" || engine_name == "MaterializedPostgreSQL") - { - /// PostgreSQL('host:port', 'database', 'user', 'password', ...) - wipePasswordFromArgument(*storage.engine, data, 3); - } - } - - static void visitFunction(ASTFunction & function, Data & data) - { - if (function.name == "mysql") - { - /// mysql('host:port', 'database', 'table', 'user', 'password', ...) - wipePasswordFromArgument(function, data, 4); - } - else if (function.name == "postgresql") - { - /// postgresql('host:port', 'database', 'table', 'user', 'password', ...) - wipePasswordFromArgument(function, data, 4); - } - else if (function.name == "mongodb") - { - /// mongodb('host:port', 'database', 'collection', 'user', 'password', ...) - wipePasswordFromArgument(function, data, 4); - } - else if (function.name == "s3" || function.name == "cosn" || function.name == "oss") - { - /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) - wipePasswordFromS3FunctionArguments(function, data, /* is_cluster_function= */ false); - } - else if (function.name == "s3Cluster") - { - /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', ...) - wipePasswordFromS3FunctionArguments(function, data, /* is_cluster_function= */ true); - } - else if (function.name == "remote" || function.name == "remoteSecure") - { - /// remote('addresses_expr', 'db', 'table', 'user', 'password', ...) 
- wipePasswordFromRemoteFunctionArguments(function, data); - } - else if ( - function.name == "encrypt" || function.name == "decrypt" || function.name == "aes_encrypt_mysql" - || function.name == "aes_decrypt_mysql" || function.name == "tryDecrypt") - { - /// encrypt('mode', 'plaintext', 'key' [, iv, aad]) - wipePasswordFromEncryptionFunctionArguments(function, data); - } - } - - static void wipePasswordFromS3FunctionArguments(ASTFunction & function, Data & data, bool is_cluster_function) - { - /// s3Cluster('cluster_name', 'url', ...) has 'url' as its second argument. - size_t url_arg_idx = is_cluster_function ? 1 : 0; - - /// We're going to replace 'aws_secret_access_key' with '[HIDDEN'] for the following signatures: - /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) - /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') - - /// But we should check the number of arguments first because we don't need to do any replacements in case of - /// s3('url' [, 'format']) or s3Cluster('cluster_name', 'url' [, 'format']) - size_t num_arguments; - if (!tryGetNumArguments(function, &num_arguments) || (num_arguments < url_arg_idx + 3)) - return; - - if (num_arguments >= url_arg_idx + 5) - { - /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'structure', ...) - wipePasswordFromArgument(function, data, url_arg_idx + 2); - } - else - { - /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) - /// We need to distinguish that from s3('url', 'format', 'structure' [, 'compression_method']). - /// So we will check whether the argument after 'url' is a format. - String format; - if (!tryGetStringFromArgument(function, url_arg_idx + 1, &format, /* allow_literal= */ true, /* allow_identifier= */ false)) - { - /// We couldn't evaluate the argument after 'url' so we don't know whether it is a format or `aws_access_key_id`. - /// So it's safer to wipe the next argument just in case. - wipePasswordFromArgument(function, data, url_arg_idx + 2); /// Wipe either `aws_secret_access_key` or `structure`. - return; - } - - if (KnownFormatNames::instance().exists(format)) - return; /// The argument after 'url' is a format: s3('url', 'format', ...) - - /// The argument after 'url' is not a format so we do our replacement: - /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) -> s3('url', 'aws_access_key_id', '[HIDDEN]', ...) - wipePasswordFromArgument(function, data, url_arg_idx + 2); - } - } - - static void wipePasswordFromRemoteFunctionArguments(ASTFunction & function, Data & data) - { - /// We're going to replace 'password' with '[HIDDEN'] for the following signatures: - /// remote('addresses_expr', db.table, 'user' [, 'password'] [, sharding_key]) - /// remote('addresses_expr', 'db', 'table', 'user' [, 'password'] [, sharding_key]) - /// remote('addresses_expr', table_function(), 'user' [, 'password'] [, sharding_key]) - - /// But we should check the number of arguments first because we don't need to do any replacements in case of - /// remote('addresses_expr', db.table) - size_t num_arguments; - if (!tryGetNumArguments(function, &num_arguments) || (num_arguments < 3)) - return; - - auto & arguments = assert_cast(*function.arguments).children; - size_t arg_num = 1; - - /// Skip 1 or 2 arguments with table_function() or db.table or 'db', 'table'. 
- const auto * table_function = arguments[arg_num]->as(); - if (table_function && KnownTableFunctionNames::instance().exists(table_function->name)) - { - ++arg_num; - } - else - { - String database; - if (!tryGetStringFromArgument(function, arg_num, &database, /* allow_literal= */ true, /* allow_identifier= */ true)) - { - /// We couldn't evaluate the argument so we don't know whether it is 'db.table' or just 'db'. - /// Hence we can't figure out whether we should skip one argument 'user' or two arguments 'table', 'user' - /// before the argument 'password'. So it's safer to wipe two arguments just in case. - /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string - /// before wiping it (because the `password` argument is always a literal string). - if (tryGetStringFromArgument(function, arg_num + 2, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) - wipePasswordFromArgument(function, data, arg_num + 2); /// Wipe either `password` or `user`. - if (tryGetStringFromArgument(function, arg_num + 3, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) - wipePasswordFromArgument(function, data, arg_num + 3); /// Wipe either `password` or `sharding_key`. - return; - } - - ++arg_num; - auto qualified_name = QualifiedTableName::parseFromString(database); - if (qualified_name.database.empty()) - ++arg_num; /// skip 'table' argument - } - - /// Skip username. - ++arg_num; - - /// Do our replacement: - /// remote('addresses_expr', db.table, 'user', 'password', ...) -> remote('addresses_expr', db.table, 'user', '[HIDDEN]', ...) - /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string - /// before wiping it (because the `password` argument is always a literal string). - bool can_be_password = tryGetStringFromArgument(function, arg_num, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false); - if (can_be_password) - wipePasswordFromArgument(function, data, arg_num); - } - - static void wipePasswordFromEncryptionFunctionArguments(ASTFunction & function, Data & data) - { - /// We replace all arguments after 'mode' with '[HIDDEN]': - /// encrypt('mode', 'plaintext', 'key' [, iv, aad]) -> encrypt('mode', '[HIDDEN]') - wipePasswordFromArgument(function, data, 1); - removeArgumentsAfter(function, data, 2); - } - - static void visitBackupQuery(ASTBackupQuery & query, Data & data) - { - if (query.backup_name) - { - if (auto * backup_engine = query.backup_name->as()) - wipePasswordFromBackupEngineArguments(*backup_engine, data); - } - - if (query.base_backup_name) - { - if (auto * base_backup_engine = query.base_backup_name->as()) - wipePasswordFromBackupEngineArguments(*base_backup_engine, data); - } - } - - static void wipePasswordFromBackupEngineArguments(ASTFunction & engine, Data & data) - { - if (engine.name == "S3") - { - /// BACKUP ... 
TO S3(url, [aws_access_key_id, aws_secret_access_key]) - wipePasswordFromArgument(engine, data, 2); - } - } - - static void wipePasswordFromArgument(ASTFunction & function, Data & data, size_t arg_idx) - { - if (!function.arguments) - return; - - auto * expr_list = function.arguments->as(); - if (!expr_list) - return; /// return because we don't want to validate query here - - auto & arguments = expr_list->children; - if (arg_idx >= arguments.size()) - return; - - if constexpr (check_only) - { - data.can_contain_password = true; - return; - } - - arguments[arg_idx] = std::make_shared("[HIDDEN]"); - data.password_was_hidden = true; - } - - static void removeArgumentsAfter(ASTFunction & function, Data & data, size_t new_num_arguments) - { - if (!function.arguments) - return; - - auto * expr_list = function.arguments->as(); - if (!expr_list) - return; /// return because we don't want to validate query here - - auto & arguments = expr_list->children; - if (new_num_arguments >= arguments.size()) - return; - - if constexpr (check_only) - { - data.can_contain_password = true; - return; - } - - arguments.resize(new_num_arguments); - data.password_was_hidden = true; - } - - static bool tryGetNumArguments(const ASTFunction & function, size_t * res) - { - if (!function.arguments) - return false; - - auto * expr_list = function.arguments->as(); - if (!expr_list) - return false; /// return false because we don't want to validate query here - - const auto & arguments = expr_list->children; - if (res) - *res = arguments.size(); - return true; - } - - static bool - tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * res, bool allow_literal, bool allow_identifier) - { - if (!function.arguments) - return false; - - const auto * expr_list = function.arguments->as(); - if (!expr_list) - return false; /// return false because we don't want to validate query here - - const auto & arguments = expr_list->children; - if (arg_idx >= arguments.size()) - return false; - - ASTPtr argument = arguments[arg_idx]; - if (allow_literal) - { - if (const auto * literal = argument->as()) - { - if (literal->value.getType() != Field::Types::String) - return false; - if (res) - *res = literal->value.safeGet(); - return true; - } - } - - if (allow_identifier) - { - if (const auto * id = argument->as()) - { - if (res) - *res = id->name(); - return true; - } - } - - return false; - } - - static void visitDictionaryDef(ASTDictionary & dictionary, Data & data) - { - if (!dictionary.source || !dictionary.source->elements) - return; - - const auto * expr_list = dictionary.source->elements->as(); - if (!expr_list) - return; /// return because we don't want to validate query here - - const auto & elements = expr_list->children; - - /// We replace password in the dictionary's definition: - /// SOURCE(CLICKHOUSE(host 'example01-01-1' port 9000 user 'default' password 'qwe123' db 'default' table 'ids')) -> - /// SOURCE(CLICKHOUSE(host 'example01-01-1' port 9000 user 'default' password '[HIDDEN]' db 'default' table 'ids')) - for (const auto & element : elements) - { - auto * pair = element->as(); - if (!pair) - continue; /// just skip because we don't want to validate query here - - if (pair->first == "password") - { - if constexpr (check_only) - { - data.can_contain_password = true; - return; - } - pair->set(pair->second, std::make_shared("[HIDDEN]")); - data.password_was_hidden = true; - } - } - } - }; - - /// Checks the type of a specified AST and returns true if it can contain a password. 
- bool canContainPassword(const IAST & ast, PasswordWipingMode mode) - { - using WipingVisitor = PasswordWipingVisitor; - WipingVisitor::Data data; - data.mode = mode; - WipingVisitor::Visitor visitor{data}; - ASTPtr ast_ptr = std::const_pointer_cast(ast.shared_from_this()); - visitor.visit(ast_ptr); - return data.can_contain_password; - } - - /// Removes a password or its hash from a query if it's specified there or replaces it with some placeholder. - /// This function is used to prepare a query for storing in logs (we don't want logs to contain sensitive information). - bool wipePasswordFromQuery(ASTPtr ast, PasswordWipingMode mode) - { - using WipingVisitor = PasswordWipingVisitor; - WipingVisitor::Data data; - data.mode = mode; - WipingVisitor::Visitor visitor{data}; - visitor.visit(ast); - return data.password_was_hidden; - } - - /// Common utility for masking sensitive information. - String maskSensitiveInfoImpl(const String & query, const ASTPtr & parsed_query, PasswordWipingMode mode) - { - String res = query; - - // Wiping a password or hash from the query because we don't want it to go to logs. - if (parsed_query && canContainPassword(*parsed_query, mode)) - { - ASTPtr ast_without_password = parsed_query->clone(); - if (wipePasswordFromQuery(ast_without_password, mode)) - res = serializeAST(*ast_without_password); - } - - // Wiping sensitive data before cropping query by log_queries_cut_to_length, - // otherwise something like credit card without last digit can go to log. - if (auto * masker = SensitiveDataMasker::getInstance()) - { - auto matches = masker->wipeSensitiveData(res); - if (matches > 0) - { - ProfileEvents::increment(ProfileEvents::QueryMaskingRulesMatch, matches); - } - } - - //res = res.substr(0, context->getSettingsRef().log_queries_cut_to_length); - - return res; - } -} - - -String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query) -{ - return maskSensitiveInfoImpl(query, parsed_query, PasswordWipingMode::Query); -} - - -String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast) -{ - return maskSensitiveInfoImpl(backup_name, ast, PasswordWipingMode::BackupName); -} - -} diff --git a/src/Parsers/maskSensitiveInfoInQueryForLogging.h b/src/Parsers/maskSensitiveInfoInQueryForLogging.h deleted file mode 100644 index 6284b9a9294..00000000000 --- a/src/Parsers/maskSensitiveInfoInQueryForLogging.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -/// Makes a version of a query without sensitive information (e.g. passwords) for logging. -/// The parameter `parsed query` is allowed to be nullptr if the query cannot be parsed. -/// Does not validate AST, works a best-effort way. -String maskSensitiveInfoInQueryForLogging(const String & query, const ASTPtr & parsed_query); - -/// Makes a version of backup name without sensitive information (e.g. passwords) for logging. -/// Does not validate AST, works a best-effort way. -String maskSensitiveInfoInBackupNameForLogging(const String & backup_name, const ASTPtr & ast); - -} From d365570b27c7ee28dcabece9cedb14d4d04e0085 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 14 Nov 2022 16:37:50 +0100 Subject: [PATCH 41/78] Hide passwords in SHOW CREATE TABLE and system.tables. 
--- src/Interpreters/InterpreterShowCreateQuery.cpp | 4 +--- src/Storages/System/StorageSystemTables.cpp | 17 ++++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/Interpreters/InterpreterShowCreateQuery.cpp b/src/Interpreters/InterpreterShowCreateQuery.cpp index 6ae51c1ff23..6853b3c1be1 100644 --- a/src/Interpreters/InterpreterShowCreateQuery.cpp +++ b/src/Interpreters/InterpreterShowCreateQuery.cpp @@ -92,9 +92,7 @@ QueryPipeline InterpreterShowCreateQuery::executeImpl() create.to_inner_uuid = UUIDHelpers::Nil; } - WriteBufferFromOwnString buf; - formatAST(*create_query, buf, false, false); - String res = buf.str(); + String res = create_query->formatWithHiddenSecrets(/* max_length= */ 0, /* one_line= */ false, /* hilite= */ false); MutableColumnPtr column = ColumnString::create(); column->insert(res); diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 83f922850a3..3f47c1ddb8e 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -232,7 +231,7 @@ protected: { auto temp_db = DatabaseCatalog::instance().getDatabaseForTemporaryTables(); ASTPtr ast = temp_db ? temp_db->tryGetCreateTableQuery(table.second->getStorageID().getTableName(), context) : nullptr; - res_columns[res_index++]->insert(ast ? queryToString(ast) : ""); + res_columns[res_index++]->insert(ast ? ast->formatWithHiddenSecrets() : ""); } // engine_full @@ -383,7 +382,7 @@ protected: } if (columns_mask[src_index++]) - res_columns[res_index++]->insert(ast ? queryToString(ast) : ""); + res_columns[res_index++]->insert(ast ? ast->formatWithHiddenSecrets() : ""); if (columns_mask[src_index++]) { @@ -391,7 +390,7 @@ protected: if (ast_create && ast_create->storage) { - engine_full = queryToString(*ast_create->storage); + engine_full = ast_create->storage->formatWithHiddenSecrets(); static const char * const extra_head = " ENGINE = "; if (startsWith(engine_full, extra_head)) @@ -405,7 +404,7 @@ protected: { String as_select; if (ast_create && ast_create->select) - as_select = queryToString(*ast_create->select); + as_select = ast_create->select->formatWithHiddenSecrets(); res_columns[res_index++]->insert(as_select); } } @@ -420,7 +419,7 @@ protected: if (columns_mask[src_index++]) { if (metadata_snapshot && (expression_ptr = metadata_snapshot->getPartitionKeyAST())) - res_columns[res_index++]->insert(queryToString(expression_ptr)); + res_columns[res_index++]->insert(expression_ptr->formatWithHiddenSecrets()); else res_columns[res_index++]->insertDefault(); } @@ -428,7 +427,7 @@ protected: if (columns_mask[src_index++]) { if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSortingKey().expression_list_ast)) - res_columns[res_index++]->insert(queryToString(expression_ptr)); + res_columns[res_index++]->insert(expression_ptr->formatWithHiddenSecrets()); else res_columns[res_index++]->insertDefault(); } @@ -436,7 +435,7 @@ protected: if (columns_mask[src_index++]) { if (metadata_snapshot && (expression_ptr = metadata_snapshot->getPrimaryKey().expression_list_ast)) - res_columns[res_index++]->insert(queryToString(expression_ptr)); + res_columns[res_index++]->insert(expression_ptr->formatWithHiddenSecrets()); else res_columns[res_index++]->insertDefault(); } @@ -444,7 +443,7 @@ protected: if (columns_mask[src_index++]) { if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSamplingKeyAST())) - 
res_columns[res_index++]->insert(queryToString(expression_ptr)); + res_columns[res_index++]->insert(expression_ptr->formatWithHiddenSecrets()); else res_columns[res_index++]->insertDefault(); } From 2e242fa99104506e92b44f08a0f5d84c34ddd649 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 14 Nov 2022 18:42:46 +0100 Subject: [PATCH 42/78] Add tests. --- .../__init__.py | 0 .../test.py | 43 +++++++++++++++++++ 2 files changed, 43 insertions(+) rename tests/integration/{test_mask_sensitive_info_in_logs => test_mask_sensitive_info}/__init__.py (100%) rename tests/integration/{test_mask_sensitive_info_in_logs => test_mask_sensitive_info}/test.py (88%) diff --git a/tests/integration/test_mask_sensitive_info_in_logs/__init__.py b/tests/integration/test_mask_sensitive_info/__init__.py similarity index 100% rename from tests/integration/test_mask_sensitive_info_in_logs/__init__.py rename to tests/integration/test_mask_sensitive_info/__init__.py diff --git a/tests/integration/test_mask_sensitive_info_in_logs/test.py b/tests/integration/test_mask_sensitive_info/test.py similarity index 88% rename from tests/integration/test_mask_sensitive_info_in_logs/test.py rename to tests/integration/test_mask_sensitive_info/test.py index 48f11fbf7a1..f546c559f66 100644 --- a/tests/integration/test_mask_sensitive_info_in_logs/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -1,6 +1,7 @@ import pytest import random, string from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV cluster = ClickHouseCluster(__file__) node = cluster.add_instance("node", with_zookeeper=True) @@ -110,6 +111,22 @@ def test_create_table(): for i, table_engine in enumerate(table_engines): node.query(f"CREATE TABLE table{i} (x int) ENGINE = {table_engine}") + assert ( + node.query("SHOW CREATE TABLE table0") + == "CREATE TABLE default.table0\\n(\\n `x` Int32\\n)\\nENGINE = MySQL(\\'mysql57:3306\\', \\'mysql_db\\', \\'mysql_table\\', \\'mysql_user\\', \\'[HIDDEN]\\')\n" + ) + + assert node.query( + "SELECT create_table_query, engine_full FROM system.tables WHERE name = 'table0'" + ) == TSV( + [ + [ + "CREATE TABLE default.table0 (`x` Int32) ENGINE = MySQL(\\'mysql57:3306\\', \\'mysql_db\\', \\'mysql_table\\', \\'mysql_user\\', \\'[HIDDEN]\\')", + "MySQL(\\'mysql57:3306\\', \\'mysql_db\\', \\'mysql_table\\', \\'mysql_user\\', \\'[HIDDEN]\\')", + ], + ] + ) + check_logs( must_contain=[ "CREATE TABLE table0 (`x` int) ENGINE = MySQL('mysql57:3306', 'mysql_db', 'mysql_table', 'mysql_user', '[HIDDEN]')", @@ -189,6 +206,22 @@ def test_table_functions(): for i, table_function in enumerate(table_functions): node.query(f"CREATE TABLE tablefunc{i} (x int) AS {table_function}") + assert ( + node.query("SHOW CREATE TABLE tablefunc0") + == "CREATE TABLE default.tablefunc0\\n(\\n `x` Int32\\n) AS mysql(\\'mysql57:3306\\', \\'mysql_db\\', \\'mysql_table\\', \\'mysql_user\\', \\'[HIDDEN]\\')\n" + ) + + assert node.query( + "SELECT create_table_query, engine_full FROM system.tables WHERE name = 'tablefunc0'" + ) == TSV( + [ + [ + "CREATE TABLE default.tablefunc0 (`x` Int32) AS mysql(\\'mysql57:3306\\', \\'mysql_db\\', \\'mysql_table\\', \\'mysql_user\\', \\'[HIDDEN]\\')", + "", + ], + ] + ) + check_logs( must_contain=[ "CREATE TABLE tablefunc0 (`x` int) AS mysql('mysql57:3306', 'mysql_db', 'mysql_table', 'mysql_user', '[HIDDEN]')", @@ -268,6 +301,16 @@ def test_create_dictionary(): f"LIFETIME(MIN 0 MAX 10) LAYOUT(FLAT())" ) + assert ( + node.query("SHOW CREATE TABLE dict1") + == "CREATE DICTIONARY 
default.dict1\\n(\\n `n` int DEFAULT 0,\\n `m` int DEFAULT 1\\n)\\nPRIMARY KEY n\\nSOURCE(CLICKHOUSE(HOST \\'localhost\\' PORT 9000 USER \\'user1\\' TABLE \\'test\\' PASSWORD \\'[HIDDEN]\\' DB \\'default\\'))\\nLIFETIME(MIN 0 MAX 10)\\nLAYOUT(FLAT())\n" + ) + + assert ( + node.query("SELECT create_table_query FROM system.tables WHERE name = 'dict1'") + == "CREATE DICTIONARY default.dict1 (`n` int DEFAULT 0, `m` int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST \\'localhost\\' PORT 9000 USER \\'user1\\' TABLE \\'test\\' PASSWORD \\'[HIDDEN]\\' DB \\'default\\')) LIFETIME(MIN 0 MAX 10) LAYOUT(FLAT())\n" + ) + check_logs( must_contain=[ "CREATE DICTIONARY dict1 (`n` int DEFAULT 0, `m` int DEFAULT 1) PRIMARY KEY n " From ce81166c7edab32c99679cf961520aeababdad4d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 14 Nov 2022 19:28:19 +0100 Subject: [PATCH 43/78] Fix style. --- src/Backups/BackupSettings.cpp | 2 +- src/Common/KnownObjectNames.cpp | 4 ++-- src/Common/SensitiveDataMasker.h | 2 +- src/Formats/FormatFactory.cpp | 4 ++-- src/Parsers/ASTFunction.cpp | 12 ++++++------ src/Parsers/ASTFunction.h | 2 +- src/Parsers/ASTFunctionWithKeyValueArguments.h | 2 +- src/Parsers/ParserBackupQuery.cpp | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 2179a572bcd..bed34f2fbcc 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -2,8 +2,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/src/Common/KnownObjectNames.cpp b/src/Common/KnownObjectNames.cpp index 45066fc93d4..0862c52d892 100644 --- a/src/Common/KnownObjectNames.cpp +++ b/src/Common/KnownObjectNames.cpp @@ -10,8 +10,8 @@ bool KnownObjectNames::exists(const String & name) const std::lock_guard lock{mutex}; if (names.contains(name)) return true; - - if (case_insensitive_names.empty()) + + if (!case_insensitive_names.empty()) { String lower_name = Poco::toLower(name); if (case_insensitive_names.contains(lower_name)) diff --git a/src/Common/SensitiveDataMasker.h b/src/Common/SensitiveDataMasker.h index 031956cb926..de5cc125dcc 100644 --- a/src/Common/SensitiveDataMasker.h +++ b/src/Common/SensitiveDataMasker.h @@ -71,6 +71,6 @@ public: /// Wipes sensitive data and cuts to a specified maximum length in one function call. /// If the maximum length is zero then the function doesn't cut to the maximum length. 
-std::string wipeSensitiveDataAndCutToLength(const std::string & str, size_t max_length = 0); +std::string wipeSensitiveDataAndCutToLength(const std::string & str, size_t max_length); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index bf1168dd8a7..868e275fcdc 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -445,8 +445,8 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_ if (target) throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); target = std::move(input_creator); - KnownFormatNames::instance().add(name); registerFileExtension(name, name); + KnownFormatNames::instance().add(name); } void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker) @@ -455,7 +455,6 @@ void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name if (target) throw Exception("FormatFactory: Non trivial prefix and suffix checker " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); target = std::move(non_trivial_prefix_and_suffix_checker); - KnownFormatNames::instance().add(name); } void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker) @@ -486,6 +485,7 @@ void FormatFactory::registerOutputFormat(const String & name, OutputCreator outp throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); target = std::move(output_creator); registerFileExtension(name, name); + KnownFormatNames::instance().add(name); } void FormatFactory::registerFileExtension(const String & extension, const String & format_name) diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index 3467b8b5ebc..8eeb06871a8 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -641,12 +641,6 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (arguments->children[i]->as()) settings.ostr << "SETTINGS "; - if ((i == 1) && special_hilite_regexp - && highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-")) - { - continue; - } - if (!settings.show_secrets && (secret_arguments.first <= i) && (i < secret_arguments.second)) { settings.ostr << "'[HIDDEN]'"; @@ -655,6 +649,12 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format continue; } + if ((i == 1) && special_hilite_regexp + && highlightStringLiteralWithMetacharacters(arguments->children[i], settings, "|()^$.[]?*+{:-")) + { + continue; + } + arguments->children[i]->formatImpl(settings, state, nested_dont_need_parens); } } diff --git a/src/Parsers/ASTFunction.h b/src/Parsers/ASTFunction.h index 052325fcf61..83b06bd26ec 100644 --- a/src/Parsers/ASTFunction.h +++ b/src/Parsers/ASTFunction.h @@ -42,7 +42,7 @@ public: /// do not print empty parentheses if there are no args - compatibility with new AST for data types and engine names. bool no_empty_args = false; - /// Kind of this function-like expression, how it's actually used. + /// Specifies where this function-like expression is used. 
enum class Kind { ORDINARY_FUNCTION, diff --git a/src/Parsers/ASTFunctionWithKeyValueArguments.h b/src/Parsers/ASTFunctionWithKeyValueArguments.h index e54bc9288f2..67d591dfcdc 100644 --- a/src/Parsers/ASTFunctionWithKeyValueArguments.h +++ b/src/Parsers/ASTFunctionWithKeyValueArguments.h @@ -29,7 +29,7 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - + bool hasSecretParts() const override; void updateTreeHashImpl(SipHash & hash_state) const override; diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index 4a8eb2b5802..78bc10511b2 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -210,7 +210,7 @@ namespace { if (!ParserIdentifierWithOptionalParameters{}.parse(pos, backup_name, expected)) return false; - + backup_name->as().kind = ASTFunction::Kind::BACKUP_NAME; return true; } From 4a25fcc056ae4bf03bec114525b1eaced6dec672 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 15 Nov 2022 17:11:26 +0100 Subject: [PATCH 44/78] Fix procesing '' as a database name in remote() --- src/Parsers/findFunctionSecretArguments.cpp | 73 +++++++++++++++------ 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/src/Parsers/findFunctionSecretArguments.cpp b/src/Parsers/findFunctionSecretArguments.cpp index 65c06b7ec97..e40c25bd83b 100644 --- a/src/Parsers/findFunctionSecretArguments.cpp +++ b/src/Parsers/findFunctionSecretArguments.cpp @@ -15,7 +15,7 @@ namespace { constexpr const std::pair npos{static_cast(-1), static_cast(-1)}; - bool tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * res, bool allow_literal, bool allow_identifier) + bool tryGetStringFromArgument(const ASTFunction & function, size_t arg_idx, String * res, bool allow_identifier = true) { if (!function.arguments) return false; @@ -29,16 +29,13 @@ namespace return false; ASTPtr argument = arguments[arg_idx]; - if (allow_literal) + if (const auto * literal = argument->as()) { - if (const auto * literal = argument->as()) - { - if (literal->value.getType() != Field::Types::String) - return false; - if (res) - *res = literal->value.safeGet(); - return true; - } + if (literal->value.getType() != Field::Types::String) + return false; + if (res) + *res = literal->value.safeGet(); + return true; } if (allow_identifier) @@ -86,7 +83,7 @@ namespace /// We need to distinguish that from s3('url', 'format', 'structure' [, 'compression_method']). /// So we will check whether the argument after 'url' is a format. String format; - if (!tryGetStringFromArgument(function, url_arg_idx + 1, &format, /* allow_literal= */ true, /* allow_identifier= */ false)) + if (!tryGetStringFromArgument(function, url_arg_idx + 1, &format, /* allow_identifier= */ false)) { /// We couldn't evaluate the argument after 'url' so we don't know whether it is a format or `aws_access_key_id`. /// So it's safer to wipe the next argument just in case. @@ -103,6 +100,40 @@ namespace } + /// Tries to get either a database name or a qualified table name from an argument. + /// Empty string is also allowed (it means the default database). + /// The function is used by findRemoteFunctionSecretArguments() to determine how many arguments to skip before a password. 
+ bool tryGetDatabaseNameOrQualifiedTableName( + const ASTFunction & function, + size_t arg_idx, + std::optional & res_database, + std::optional & res_qualified_table_name) + { + res_database.reset(); + res_qualified_table_name.reset(); + + String str; + if (!tryGetStringFromArgument(function, arg_idx, &str, /* allow_identifier= */ true)) + return false; + + if (str.empty()) + { + res_database = ""; + return true; + } + + auto qualified_table_name = QualifiedTableName::tryParseFromString(str); + if (!qualified_table_name) + return false; + + if (qualified_table_name->database.empty()) + res_database = std::move(qualified_table_name->table); + else + res_qualified_table_name = std::move(qualified_table_name); + return true; + } + + std::pair findRemoteFunctionSecretArguments(const ASTFunction & function) { const auto * expr_list = function.arguments->as(); @@ -131,8 +162,9 @@ namespace } else { - String database; - if (!tryGetStringFromArgument(function, arg_num, &database, /* allow_literal= */ true, /* allow_identifier= */ true)) + std::optional database; + std::optional qualified_table_name; + if (!tryGetDatabaseNameOrQualifiedTableName(function, arg_num, database, qualified_table_name)) { /// We couldn't evaluate the argument so we don't know whether it is 'db.table' or just 'db'. /// Hence we can't figure out whether we should skip one argument 'user' or two arguments 'table', 'user' @@ -140,12 +172,12 @@ namespace /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string /// before wiping it (because the `password` argument is always a literal string). auto res = npos; - if (tryGetStringFromArgument(function, arg_num + 2, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + if (tryGetStringFromArgument(function, arg_num + 2, nullptr, /* allow_identifier= */ false)) { /// Wipe either `password` or `user`. res = {arg_num + 2, arg_num + 3}; } - if (tryGetStringFromArgument(function, arg_num + 3, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false)) + if (tryGetStringFromArgument(function, arg_num + 3, nullptr, /* allow_identifier= */ false)) { /// Wipe either `password` or `sharding_key`. if (res == npos) @@ -155,10 +187,13 @@ namespace return res; } + /// Skip the current argument (which is either a database name or a qualified table name). ++arg_num; - auto qualified_name = QualifiedTableName::parseFromString(database); - if (qualified_name.database.empty()) - ++arg_num; /// skip 'table' argument + if (database) + { + /// Skip the 'table' argument if the previous argument was a database name. + ++arg_num; + } } /// Skip username. @@ -168,7 +203,7 @@ namespace /// remote('addresses_expr', db.table, 'user', 'password', ...) -> remote('addresses_expr', db.table, 'user', '[HIDDEN]', ...) /// The last argument can be also a `sharding_key`, so we need to check that argument is a literal string /// before wiping it (because the `password` argument is always a literal string). - bool can_be_password = tryGetStringFromArgument(function, arg_num, nullptr, /* allow_literal= */ true, /* allow_identifier= */ false); + bool can_be_password = tryGetStringFromArgument(function, arg_num, nullptr, /* allow_identifier= */ false); if (can_be_password) return {arg_num, arg_num + 1}; From 63be12adca23c5b227bf85a330143263c2316550 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 15 Nov 2022 17:11:35 +0100 Subject: [PATCH 45/78] Fix tests. 
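
The only change in the expected outputs is that the password inside the dictionary
SOURCE clause is now rendered as '[HIDDEN]'. A minimal sketch of the behaviour the
updated references expect (database, table and dictionary names here are placeholders,
not taken from the tests):

    -- assumes a locally running server and the default user
    CREATE DATABASE IF NOT EXISTS db_sketch;
    CREATE TABLE db_sketch.src (key UInt64, val UInt64) ENGINE = Memory;

    CREATE DICTIONARY db_sketch.dict_sketch
    (
        `key` UInt64 DEFAULT 0,
        `val` UInt64 DEFAULT 10
    )
    PRIMARY KEY key
    SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'src' PASSWORD '' DB 'db_sketch'))
    LIFETIME(MIN 0 MAX 10)
    LAYOUT(FLAT());

    -- Only the formatted text is masked: SHOW CREATE and system.tables now
    -- print PASSWORD '[HIDDEN]' instead of the literal value.
    SHOW CREATE DICTIONARY db_sketch.dict_sketch;
    SELECT create_table_query FROM system.tables WHERE database = 'db_sketch' AND name = 'dict_sketch';

Accordingly, the expected outputs below are updated from PASSWORD '' to PASSWORD '[HIDDEN]'.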
--- .../0_stateless/01018_ddl_dictionaries_create.reference | 4 ++-- tests/queries/0_stateless/01190_full_attach_syntax.reference | 4 ++-- .../0_stateless/01224_no_superfluous_dict_reload.reference | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/01018_ddl_dictionaries_create.reference b/tests/queries/0_stateless/01018_ddl_dictionaries_create.reference index 9b130f11df6..450eaee4b2f 100644 --- a/tests/queries/0_stateless/01018_ddl_dictionaries_create.reference +++ b/tests/queries/0_stateless/01018_ddl_dictionaries_create.reference @@ -1,5 +1,5 @@ =DICTIONARY in Ordinary DB -CREATE DICTIONARY db_01018.dict1\n(\n `key_column` UInt64 DEFAULT 0,\n `second_column` UInt8 DEFAULT 1,\n `third_column` String DEFAULT \'qqq\'\n)\nPRIMARY KEY key_column\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'\' DB \'database_for_dict_01018\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) +CREATE DICTIONARY db_01018.dict1\n(\n `key_column` UInt64 DEFAULT 0,\n `second_column` UInt8 DEFAULT 1,\n `third_column` String DEFAULT \'qqq\'\n)\nPRIMARY KEY key_column\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'[HIDDEN]\' DB \'database_for_dict_01018\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) dict1 1 db_01018 dict1 @@ -12,7 +12,7 @@ db_01018 dict1 ==DROP DICTIONARY 0 =DICTIONARY in Memory DB -CREATE DICTIONARY memory_db.dict2\n(\n `key_column` UInt64 DEFAULT 0 INJECTIVE,\n `second_column` UInt8 DEFAULT 1 EXPRESSION rand() % 222,\n `third_column` String DEFAULT \'qqq\'\n)\nPRIMARY KEY key_column\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'\' DB \'database_for_dict_01018\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) +CREATE DICTIONARY memory_db.dict2\n(\n `key_column` UInt64 DEFAULT 0 INJECTIVE,\n `second_column` UInt8 DEFAULT 1 EXPRESSION rand() % 222,\n `third_column` String DEFAULT \'qqq\'\n)\nPRIMARY KEY key_column\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'[HIDDEN]\' DB \'database_for_dict_01018\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) dict2 1 memory_db dict2 diff --git a/tests/queries/0_stateless/01190_full_attach_syntax.reference b/tests/queries/0_stateless/01190_full_attach_syntax.reference index f924c2ec780..61a0603975a 100644 --- a/tests/queries/0_stateless/01190_full_attach_syntax.reference +++ b/tests/queries/0_stateless/01190_full_attach_syntax.reference @@ -1,5 +1,5 @@ -CREATE DICTIONARY test_01190.dict\n(\n `key` UInt64 DEFAULT 0,\n `col` UInt8 DEFAULT 1\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'\' DB \'test_01190\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) -CREATE DICTIONARY test_01190.dict\n(\n `key` UInt64 DEFAULT 0,\n `col` UInt8 DEFAULT 1\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'\' DB \'test_01190\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) +CREATE DICTIONARY test_01190.dict\n(\n `key` UInt64 DEFAULT 0,\n `col` UInt8 DEFAULT 1\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'table_for_dict\' PASSWORD \'[HIDDEN]\' DB \'test_01190\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) +CREATE DICTIONARY test_01190.dict\n(\n `key` UInt64 DEFAULT 0,\n `col` UInt8 DEFAULT 1\n)\nPRIMARY KEY key\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER 
\'default\' TABLE \'table_for_dict\' PASSWORD \'[HIDDEN]\' DB \'test_01190\'))\nLIFETIME(MIN 1 MAX 10)\nLAYOUT(FLAT()) CREATE TABLE test_01190.log\n(\n `s` String\n)\nENGINE = Log CREATE TABLE test_01190.log\n(\n `s` String\n)\nENGINE = Log test diff --git a/tests/queries/0_stateless/01224_no_superfluous_dict_reload.reference b/tests/queries/0_stateless/01224_no_superfluous_dict_reload.reference index d80501b3f4d..e5b8ffd7732 100644 --- a/tests/queries/0_stateless/01224_no_superfluous_dict_reload.reference +++ b/tests/queries/0_stateless/01224_no_superfluous_dict_reload.reference @@ -6,7 +6,7 @@ CREATE DICTIONARY dict_db_01224.dict `val` UInt64 DEFAULT 10 ) PRIMARY KEY key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'dict_data' PASSWORD '' DB 'dict_db_01224')) +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'dict_data' PASSWORD '[HIDDEN]' DB 'dict_db_01224')) LIFETIME(MIN 0 MAX 0) LAYOUT(FLAT()) NOT_LOADED @@ -17,7 +17,7 @@ CREATE TABLE dict_db_01224_dictionary.`dict_db_01224.dict` ) ENGINE = Dictionary(`dict_db_01224.dict`) NOT_LOADED -Dictionary 1 CREATE DICTIONARY dict_db_01224.dict (`key` UInt64 DEFAULT 0, `val` UInt64 DEFAULT 10) PRIMARY KEY key SOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'dict_data\' PASSWORD \'\' DB \'dict_db_01224\')) LIFETIME(MIN 0 MAX 0) LAYOUT(FLAT()) +Dictionary 1 CREATE DICTIONARY dict_db_01224.dict (`key` UInt64 DEFAULT 0, `val` UInt64 DEFAULT 10) PRIMARY KEY key SOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' TABLE \'dict_data\' PASSWORD \'[HIDDEN]\' DB \'dict_db_01224\')) LIFETIME(MIN 0 MAX 0) LAYOUT(FLAT()) NOT_LOADED key UInt64 val UInt64 From 11c28abbed324a0f9b27fdd82e939402c93a4aa8 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 16 Nov 2022 00:46:57 +0000 Subject: [PATCH 46/78] fix race in IColumn::dumpStructure --- src/Columns/ColumnAggregateFunction.cpp | 6 ++--- src/Columns/ColumnArray.h | 8 +++--- src/Columns/ColumnConst.h | 6 ++--- src/Columns/ColumnLowCardinality.h | 9 ++++--- src/Columns/ColumnMap.cpp | 6 ++--- src/Columns/ColumnMap.h | 4 +-- src/Columns/ColumnNullable.h | 8 +++--- src/Columns/ColumnObject.cpp | 8 +++--- src/Columns/ColumnObject.h | 4 +-- src/Columns/ColumnSparse.cpp | 8 +++--- src/Columns/ColumnSparse.h | 4 +-- src/Columns/ColumnTuple.cpp | 6 ++--- src/Columns/ColumnTuple.h | 4 +-- src/Columns/ColumnUnique.h | 18 ++++++++++--- src/Columns/IColumn.cpp | 22 ++++++++++++--- src/Columns/IColumn.h | 17 +++++++++--- .../tests/gtest_column_dump_structure.cpp | 27 +++++++++++++++++++ src/Interpreters/AggregationUtils.cpp | 7 ++--- .../02482_insert_into_dist_race.reference | 0 .../02482_insert_into_dist_race.sql | 26 ++++++++++++++++++ 20 files changed, 145 insertions(+), 53 deletions(-) create mode 100644 src/Columns/tests/gtest_column_dump_structure.cpp create mode 100644 tests/queries/0_stateless/02482_insert_into_dist_race.reference create mode 100644 tests/queries/0_stateless/02482_insert_into_dist_race.sql diff --git a/src/Columns/ColumnAggregateFunction.cpp b/src/Columns/ColumnAggregateFunction.cpp index b51b21307f3..f51a0426199 100644 --- a/src/Columns/ColumnAggregateFunction.cpp +++ b/src/Columns/ColumnAggregateFunction.cpp @@ -152,16 +152,16 @@ MutableColumnPtr ColumnAggregateFunction::convertToValues(MutableColumnPtr colum /// If there are references to states in final column, we must hold their ownership /// by holding arenas and source. 
- auto callback = [&](auto & subcolumn) + auto callback = [&](IColumn & subcolumn) { - if (auto * aggregate_subcolumn = typeid_cast(subcolumn.get())) + if (auto * aggregate_subcolumn = typeid_cast(&subcolumn)) { aggregate_subcolumn->foreign_arenas = concatArenas(column_aggregate_func.foreign_arenas, column_aggregate_func.my_arena); aggregate_subcolumn->src = column_aggregate_func.getPtr(); } }; - callback(res); + callback(*res); res->forEachSubcolumnRecursively(callback); for (auto * val : data) diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 5970802f598..44652fd0c4b 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -151,17 +151,17 @@ public: ColumnPtr compress() const override; - void forEachSubcolumn(ColumnCallback callback) override + void forEachSubcolumn(ColumnCallback callback) const override { callback(offsets); callback(data); } - void forEachSubcolumnRecursively(ColumnCallback callback) override + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override { - callback(offsets); + callback(*offsets); offsets->forEachSubcolumnRecursively(callback); - callback(data); + callback(*data); data->forEachSubcolumnRecursively(callback); } diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index e1f335460fd..0c9245885c4 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -230,14 +230,14 @@ public: data->getExtremes(min, max); } - void forEachSubcolumn(ColumnCallback callback) override + void forEachSubcolumn(ColumnCallback callback) const override { callback(data); } - void forEachSubcolumnRecursively(ColumnCallback callback) override + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override { - callback(data); + callback(*data); data->forEachSubcolumnRecursively(callback); } diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index 20bc45eb569..e7310e70fcb 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -164,7 +164,7 @@ public: size_t byteSizeAt(size_t n) const override { return getDictionary().byteSizeAt(getIndexes().getUInt(n)); } size_t allocatedBytes() const override { return idx.getPositions()->allocatedBytes() + getDictionary().allocatedBytes(); } - void forEachSubcolumn(ColumnCallback callback) override + void forEachSubcolumn(ColumnCallback callback) const override { callback(idx.getPositionsPtr()); @@ -173,15 +173,15 @@ public: callback(dictionary.getColumnUniquePtr()); } - void forEachSubcolumnRecursively(ColumnCallback callback) override + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override { - callback(idx.getPositionsPtr()); + callback(*idx.getPositionsPtr()); idx.getPositionsPtr()->forEachSubcolumnRecursively(callback); /// Column doesn't own dictionary if it's shared. 
if (!dictionary.isShared()) { - callback(dictionary.getColumnUniquePtr()); + callback(*dictionary.getColumnUniquePtr()); dictionary.getColumnUniquePtr()->forEachSubcolumnRecursively(callback); } } @@ -278,6 +278,7 @@ public: const ColumnPtr & getPositions() const { return positions; } WrappedPtr & getPositionsPtr() { return positions; } + const WrappedPtr & getPositionsPtr() const { return positions; } size_t getPositionAt(size_t row) const; void insertPosition(UInt64 position); void insertPositionsRange(const IColumn & column, UInt64 offset, UInt64 limit); diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 7377707bdb2..0fe6d4c361b 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -273,14 +273,14 @@ void ColumnMap::getExtremes(Field & min, Field & max) const max = std::move(map_max_value); } -void ColumnMap::forEachSubcolumn(ColumnCallback callback) +void ColumnMap::forEachSubcolumn(ColumnCallback callback) const { callback(nested); } -void ColumnMap::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnMap::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { - callback(nested); + callback(*nested); nested->forEachSubcolumnRecursively(callback); } diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index 1e03633ced7..db918c3db50 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -88,8 +88,8 @@ public: size_t byteSizeAt(size_t n) const override; size_t allocatedBytes() const override; void protect() override; - void forEachSubcolumn(ColumnCallback callback) override; - void forEachSubcolumnRecursively(ColumnCallback callback) override; + void forEachSubcolumn(ColumnCallback callback) const override; + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override; bool structureEquals(const IColumn & rhs) const override; double getRatioOfDefaultRows(double sample_ratio) const override; void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override; diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 2d15442e583..85bf095a9d1 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -130,17 +130,17 @@ public: ColumnPtr compress() const override; - void forEachSubcolumn(ColumnCallback callback) override + void forEachSubcolumn(ColumnCallback callback) const override { callback(nested_column); callback(null_map); } - void forEachSubcolumnRecursively(ColumnCallback callback) override + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override { - callback(nested_column); + callback(*nested_column); nested_column->forEachSubcolumnRecursively(callback); - callback(null_map); + callback(*null_map); null_map->forEachSubcolumnRecursively(callback); } diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index bf4630137d5..c3e8af5a8cf 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -664,20 +664,20 @@ size_t ColumnObject::allocatedBytes() const return res; } -void ColumnObject::forEachSubcolumn(ColumnCallback callback) +void ColumnObject::forEachSubcolumn(ColumnCallback callback) const { for (auto & entry : subcolumns) for (auto & part : entry->data.data) callback(part); } -void ColumnObject::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnObject::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { for (auto & entry : subcolumns) { - for (auto & part : entry->data.data) + for (const auto & part 
: entry->data.data) { - callback(part); + callback(*part); part->forEachSubcolumnRecursively(callback); } } diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h index 8fcf3d41fba..aa31cf54413 100644 --- a/src/Columns/ColumnObject.h +++ b/src/Columns/ColumnObject.h @@ -206,8 +206,8 @@ public: size_t size() const override; size_t byteSize() const override; size_t allocatedBytes() const override; - void forEachSubcolumn(ColumnCallback callback) override; - void forEachSubcolumnRecursively(ColumnCallback callback) override; + void forEachSubcolumn(ColumnCallback callback) const override; + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override; void insert(const Field & field) override; void insertDefault() override; void insertFrom(const IColumn & src, size_t n) override; diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 0c3f8d11adc..0f44d2994fb 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -744,17 +744,17 @@ bool ColumnSparse::structureEquals(const IColumn & rhs) const return false; } -void ColumnSparse::forEachSubcolumn(ColumnCallback callback) +void ColumnSparse::forEachSubcolumn(ColumnCallback callback) const { callback(values); callback(offsets); } -void ColumnSparse::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnSparse::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { - callback(values); + callback(*values); values->forEachSubcolumnRecursively(callback); - callback(offsets); + callback(*offsets); offsets->forEachSubcolumnRecursively(callback); } diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index 5814cd77637..ffe392a83c1 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -139,8 +139,8 @@ public: ColumnPtr compress() const override; - void forEachSubcolumn(ColumnCallback callback) override; - void forEachSubcolumnRecursively(ColumnCallback callback) override; + void forEachSubcolumn(ColumnCallback callback) const override; + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override; bool structureEquals(const IColumn & rhs) const override; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index d8a43bf510d..c8ac2c67c36 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -495,17 +495,17 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const max = max_tuple; } -void ColumnTuple::forEachSubcolumn(ColumnCallback callback) +void ColumnTuple::forEachSubcolumn(ColumnCallback callback) const { for (auto & column : columns) callback(column); } -void ColumnTuple::forEachSubcolumnRecursively(ColumnCallback callback) +void ColumnTuple::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { for (auto & column : columns) { - callback(column); + callback(*column); column->forEachSubcolumnRecursively(callback); } } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 96395d4edfb..25f6328b3fc 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -96,8 +96,8 @@ public: size_t byteSizeAt(size_t n) const override; size_t allocatedBytes() const override; void protect() override; - void forEachSubcolumn(ColumnCallback callback) override; - void forEachSubcolumnRecursively(ColumnCallback callback) override; + void forEachSubcolumn(ColumnCallback callback) const override; + void forEachSubcolumnRecursively(RecursiveColumnCallback callback) const override; bool structureEquals(const 
IColumn & rhs) const override; bool isCollationSupported() const override; ColumnPtr compress() const override; diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index a1579a46ae0..27faf4bd2ad 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -105,7 +105,13 @@ public: return column_holder->allocatedBytes() + reverse_index.allocatedBytes() + (nested_null_mask ? nested_null_mask->allocatedBytes() : 0); } - void forEachSubcolumn(IColumn::ColumnCallback callback) override + + void forEachSubcolumn(IColumn::ColumnCallback callback) const override + { + callback(column_holder); + } + + void forEachSubcolumn(IColumn::MutableColumnCallback callback) override { callback(column_holder); reverse_index.setColumn(getRawColumnPtr()); @@ -113,9 +119,15 @@ public: nested_column_nullable = ColumnNullable::create(column_holder, nested_null_mask); } - void forEachSubcolumnRecursively(IColumn::ColumnCallback callback) override + void forEachSubcolumnRecursively(IColumn::RecursiveColumnCallback callback) const override { - callback(column_holder); + callback(*column_holder); + column_holder->forEachSubcolumnRecursively(callback); + } + + void forEachSubcolumnRecursively(IColumn::RecursiveMutableColumnCallback callback) override + { + callback(*column_holder); column_holder->forEachSubcolumnRecursively(callback); reverse_index.setColumn(getRawColumnPtr()); if (is_nullable) diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index 2158adb86a8..46f07cffa9e 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -20,12 +20,10 @@ String IColumn::dumpStructure() const WriteBufferFromOwnString res; res << getFamilyName() << "(size = " << size(); - ColumnCallback callback = [&](ColumnPtr & subcolumn) + forEachSubcolumn([&](const auto & subcolumn) { res << ", " << subcolumn->dumpStructure(); - }; - - const_cast(this)->forEachSubcolumn(callback); + }); res << ")"; return res.str(); @@ -64,6 +62,22 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & defa return res; } +void IColumn::forEachSubcolumn(MutableColumnCallback callback) +{ + std::as_const(*this).forEachSubcolumn([&callback](const WrappedPtr & subcolumn) + { + callback(const_cast(subcolumn)); + }); +} + +void IColumn::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) +{ + std::as_const(*this).forEachSubcolumnRecursively([&callback](const IColumn & subcolumn) + { + callback(const_cast(subcolumn)); + }); +} + bool isColumnNullable(const IColumn & column) { return checkColumn(column); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 461e41e3eec..94e87508619 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -411,11 +411,22 @@ public: /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them. /// Shallow: doesn't do recursive calls; don't do call for itself. - using ColumnCallback = std::function; - virtual void forEachSubcolumn(ColumnCallback) {} + + using ColumnCallback = std::function; + virtual void forEachSubcolumn(ColumnCallback) const {} + + using MutableColumnCallback = std::function; + virtual void forEachSubcolumn(MutableColumnCallback callback); /// Similar to forEachSubcolumn but it also do recursive calls. - virtual void forEachSubcolumnRecursively(ColumnCallback) {} + /// In recursive calls it's prohibited to replace pointers + /// to subcolumns, so we use another callback function. 
+ + using RecursiveColumnCallback = std::function; + virtual void forEachSubcolumnRecursively(RecursiveColumnCallback) const {} + + using RecursiveMutableColumnCallback = std::function; + virtual void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback); /// Columns have equal structure. /// If true - you can use "compareAt", "insertFrom", etc. methods. diff --git a/src/Columns/tests/gtest_column_dump_structure.cpp b/src/Columns/tests/gtest_column_dump_structure.cpp new file mode 100644 index 00000000000..69769ba0db9 --- /dev/null +++ b/src/Columns/tests/gtest_column_dump_structure.cpp @@ -0,0 +1,27 @@ +#include +#include +#include +#include +#include + +using namespace DB; + +TEST(IColumn, dumpStructure) +{ + auto type_lc = std::make_shared(std::make_shared()); + ColumnPtr column_lc = type_lc->createColumn(); + auto expected_structure = "ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))"; + + std::vector threads; + for (size_t i = 0; i < 6; ++i) + { + threads.emplace_back([&] + { + for (size_t j = 0; j < 10000; ++j) + ASSERT_EQ(column_lc->dumpStructure(), expected_structure); + }); + } + + for (auto & t : threads) + t.join(); +} diff --git a/src/Interpreters/AggregationUtils.cpp b/src/Interpreters/AggregationUtils.cpp index ed5e1512a1f..4e870e8152b 100644 --- a/src/Interpreters/AggregationUtils.cpp +++ b/src/Interpreters/AggregationUtils.cpp @@ -50,14 +50,15 @@ OutputBlockColumns prepareOutputBlockColumns( if (aggregate_functions[i]->isState()) { - auto callback = [&](auto & subcolumn) + auto callback = [&](IColumn & subcolumn) { /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. - if (auto * column_aggregate_func = typeid_cast(subcolumn.get())) + if (auto * column_aggregate_func = typeid_cast(&subcolumn)) for (auto & pool : aggregates_pools) column_aggregate_func->addArena(pool); }; - callback(final_aggregate_columns[i]); + + callback(*final_aggregate_columns[i]); final_aggregate_columns[i]->forEachSubcolumnRecursively(callback); } } diff --git a/tests/queries/0_stateless/02482_insert_into_dist_race.reference b/tests/queries/0_stateless/02482_insert_into_dist_race.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02482_insert_into_dist_race.sql b/tests/queries/0_stateless/02482_insert_into_dist_race.sql new file mode 100644 index 00000000000..ee412a87d33 --- /dev/null +++ b/tests/queries/0_stateless/02482_insert_into_dist_race.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS tmp_02482; +DROP TABLE IF EXISTS dist_02482; + +-- This test produces warning +SET send_logs_level = 'error'; +SET prefer_localhost_replica=0; + +CREATE TABLE tmp_02482 (i UInt64, n LowCardinality(String)) ENGINE = Memory; +CREATE TABLE dist_02482(i UInt64, n LowCardinality(Nullable(String))) ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), tmp_02482, i); + +SET insert_distributed_sync=1; + +INSERT INTO dist_02482 VALUES (1, '1'), (2, '2'); +INSERT INTO dist_02482 SELECT number, number FROM numbers(1000); + +SET insert_distributed_sync=0; + +SYSTEM STOP DISTRIBUTED SENDS dist_02482; + +INSERT INTO dist_02482 VALUES (1, '1'),(2, '2'); +INSERT INTO dist_02482 SELECT number, number FROM numbers(1000); + +SYSTEM FLUSH DISTRIBUTED dist_02482; + +DROP TABLE tmp_02482; +DROP TABLE dist_02482; From d9adf2f02d50a6c15229150516f90d94aa97a65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Wed, 16 Nov 2022 
09:34:04 +0800 Subject: [PATCH 47/78] fix doc --- docs/en/sql-reference/functions/math-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 47c27268b09..bcd118ce0be 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -555,7 +555,7 @@ Result: Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. -The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater overflows the range for Int64 and will cause exception throw. +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater will cause exception throw. **Syntax** From 6393d11dabe76d8e7f73a8fa5c6cc054f670a91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Wed, 16 Nov 2022 09:34:45 +0800 Subject: [PATCH 48/78] fix doc --- src/Functions/factorial.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp index b76ef90a48d..4e96391bccd 100644 --- a/src/Functions/factorial.cpp +++ b/src/Functions/factorial.cpp @@ -103,7 +103,7 @@ REGISTER_FUNCTION(Factorial) R"( Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. -The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater overflows the range for Int64 and will cause exception throw. +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater will cause exception throw. 
)", Documentation::Examples{{"factorial", "SELECT factorial(10)"}}, Documentation::Categories{"Mathematical"}}, From 81971acf3551069aa87fceb757da146af24986ed Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 16 Nov 2022 02:32:44 +0000 Subject: [PATCH 49/78] Fix docs --- docs/en/engines/table-engines/log-family/index.md | 2 +- .../table-engines/mergetree-family/mergetree.md | 4 ++-- .../table-engines/mergetree-family/replication.md | 2 +- docs/en/engines/table-engines/special/join.md | 2 +- docs/en/operations/settings/index.md | 2 +- docs/en/operations/settings/settings.md | 14 +++++++------- docs/en/operations/system-tables/mutations.md | 4 ++-- docs/en/operations/system-tables/parts.md | 2 +- docs/en/operations/system-tables/parts_columns.md | 2 +- docs/en/sql-reference/statements/alter/column.md | 2 +- docs/en/sql-reference/statements/alter/delete.md | 6 +++--- .../en/sql-reference/statements/alter/partition.md | 4 ++-- .../sql-reference/statements/alter/projection.md | 6 +++--- .../statements/alter/skipping-index.md | 2 +- docs/en/sql-reference/statements/alter/update.md | 6 +++--- docs/ru/operations/settings/index.md | 2 +- docs/ru/operations/settings/settings.md | 2 +- docs/ru/sql-reference/statements/alter/column.md | 2 +- docs/ru/sql-reference/statements/insert-into.md | 2 +- docs/ru/sql-reference/statements/select/index.md | 2 +- docs/zh/sql-reference/statements/alter.md | 2 +- docs/zh/sql-reference/statements/insert-into.md | 2 +- 22 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/en/engines/table-engines/log-family/index.md b/docs/en/engines/table-engines/log-family/index.md index 486c41c2496..21f857510f7 100644 --- a/docs/en/engines/table-engines/log-family/index.md +++ b/docs/en/engines/table-engines/log-family/index.md @@ -28,7 +28,7 @@ Engines: During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently. -- Do not support [mutations](/docs/en/sql-reference/statements/alter/index.md/#alter-mutations). +- Do not support [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - Do not support indexes. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 7dfb5a9fed7..ce6cec079a3 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -537,7 +537,7 @@ TTL time_column TTL time_column + interval ``` -To define `interval`, use [time interval](/docs/en/sql-reference/operators/index.md/#operators-datetime) operators, for example: +To define `interval`, use [time interval](/docs/en/sql-reference/operators/index.md#operators-datetime) operators, for example: ``` sql TTL date_time + INTERVAL 1 MONTH @@ -860,7 +860,7 @@ The number of threads performing background moves of data parts can be changed b In the case of `MergeTree` tables, data is getting to disk in different ways: - As a result of an insert (`INSERT` query). -- During background merges and [mutations](/docs/en/sql-reference/statements/alter/index.md/#alter-mutations). +- During background merges and [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - When downloading from another replica. 
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index 67b595d0fa0..ead1a76992e 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -20,7 +20,7 @@ Replication works at the level of an individual table, not the entire server. A Replication does not depend on sharding. Each shard has its own independent replication. -Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](/docs/en/sql-reference/statements/alter/index.md/#query_language_queries_alter)). +Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](/docs/en/sql-reference/statements/alter/index.md#query_language_queries_alter)). `CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated: diff --git a/docs/en/engines/table-engines/special/join.md b/docs/en/engines/table-engines/special/join.md index 0e51a8b7696..a49214bd00a 100644 --- a/docs/en/engines/table-engines/special/join.md +++ b/docs/en/engines/table-engines/special/join.md @@ -59,7 +59,7 @@ Main use-cases for `Join`-engine tables are following: ### Deleting Data {#deleting-data} -`ALTER DELETE` queries for `Join`-engine tables are implemented as [mutations](/docs/en/sql-reference/statements/alter/index.md/#mutations). `DELETE` mutation reads filtered data and overwrites data of memory and disk. +`ALTER DELETE` queries for `Join`-engine tables are implemented as [mutations](/docs/en/sql-reference/statements/alter/index.md#mutations). `DELETE` mutation reads filtered data and overwrites data of memory and disk. ### Limitations and Settings {#join-limitations-and-settings} diff --git a/docs/en/operations/settings/index.md b/docs/en/operations/settings/index.md index 8603257ea55..eee4058c230 100644 --- a/docs/en/operations/settings/index.md +++ b/docs/en/operations/settings/index.md @@ -26,7 +26,7 @@ Ways to configure settings, in order of priority: - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). - - Make settings in the [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select) clause of the SELECT query. The setting value is applied only to that query and is reset to default or previous value after the query is executed. + - Make settings in the [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select-query) clause of the SELECT query. The setting value is applied only to that query and is reset to default or previous value after the query is executed. Settings that can only be made in the server config file are not covered in this section. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 7abe4affbd1..a15a6e9bf4a 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -276,7 +276,7 @@ Default value: 0. 
Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md/#create-default-values) instead of [NULL](../../sql-reference/syntax.md/#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md/#data_type-nullable) data type. If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. -This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md/#insert_query_insert-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. +This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md/#inserting-the-results-of-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. Possible values: @@ -1619,8 +1619,8 @@ These functions can be transformed: - [length](../../sql-reference/functions/array-functions.md/#array_functions-length) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. - [empty](../../sql-reference/functions/array-functions.md/#function-empty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. - [notEmpty](../../sql-reference/functions/array-functions.md/#function-notempty) to read the [size0](../../sql-reference/data-types/array.md/#array-size) subcolumn. -- [isNull](../../sql-reference/operators/index.md/#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. -- [isNotNull](../../sql-reference/operators/index.md/#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [isNull](../../sql-reference/operators/index.md#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. +- [isNotNull](../../sql-reference/operators/index.md#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. - [count](../../sql-reference/aggregate-functions/reference/count.md) to read the [null](../../sql-reference/data-types/nullable.md/#finding-null) subcolumn. - [mapKeys](../../sql-reference/functions/tuple-map-functions.md/#mapkeys) to read the [keys](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. - [mapValues](../../sql-reference/functions/tuple-map-functions.md/#mapvalues) to read the [values](../../sql-reference/data-types/map.md/#map-subcolumns) subcolumn. @@ -2041,7 +2041,7 @@ Default value: 16. ## validate_polygons {#validate_polygons} -Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md/#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. +Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. Possible values: @@ -2227,7 +2227,7 @@ Default value: `0`. ## mutations_sync {#mutations_sync} -Allows to execute `ALTER TABLE ... UPDATE|DELETE` queries ([mutations](../../sql-reference/statements/alter/index.md/#mutations)) synchronously. +Allows to execute `ALTER TABLE ... UPDATE|DELETE` queries ([mutations](../../sql-reference/statements/alter/index.md#mutations)) synchronously. Possible values: @@ -2239,8 +2239,8 @@ Default value: `0`. 
**See Also** -- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md/#synchronicity-of-alter-queries) -- [Mutations](../../sql-reference/statements/alter/index.md/#mutations) +- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [Mutations](../../sql-reference/statements/alter/index.md#mutations) ## ttl_only_drop_parts {#ttl_only_drop_parts} diff --git a/docs/en/operations/system-tables/mutations.md b/docs/en/operations/system-tables/mutations.md index 0d3b764846b..d8fb91a63f5 100644 --- a/docs/en/operations/system-tables/mutations.md +++ b/docs/en/operations/system-tables/mutations.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/mutations --- # mutations -The table contains information about [mutations](/docs/en/sql-reference/statements/alter/index.md/#mutations) of [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row. +The table contains information about [mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) of [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row. Columns: @@ -45,7 +45,7 @@ If there were problems with mutating some data parts, the following columns cont **See Also** -- [Mutations](/docs/en/sql-reference/statements/alter/index.md/#mutations) +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) - [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) table engine - [ReplicatedMergeTree](/docs/en/engines/table-engines/mergetree-family/replication.md) family diff --git a/docs/en/operations/system-tables/parts.md b/docs/en/operations/system-tables/parts.md index cbabd9b27b1..bbd5385f44b 100644 --- a/docs/en/operations/system-tables/parts.md +++ b/docs/en/operations/system-tables/parts.md @@ -9,7 +9,7 @@ Each row describes one data part. Columns: -- `partition` ([String](../../sql-reference/data-types/string.md)) – The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md/#query_language_queries_alter) query. +- `partition` ([String](../../sql-reference/data-types/string.md)) – The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) query. Formats: diff --git a/docs/en/operations/system-tables/parts_columns.md b/docs/en/operations/system-tables/parts_columns.md index d934e01f245..68757ddfbff 100644 --- a/docs/en/operations/system-tables/parts_columns.md +++ b/docs/en/operations/system-tables/parts_columns.md @@ -9,7 +9,7 @@ Each row describes one data part. Columns: -- `partition` ([String](../../sql-reference/data-types/string.md)) — The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md/#query_language_queries_alter) query. +- `partition` ([String](../../sql-reference/data-types/string.md)) — The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) query. 
Formats: diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 6bca0dbff42..ae8671ffa9d 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -254,7 +254,7 @@ The `ALTER` query lets you create and delete separate elements (columns) in nest There is no support for deleting columns in the primary key or the sampling key (columns that are used in the `ENGINE` expression). Changing the type for columns that are included in the primary key is only possible if this change does not cause the data to be modified (for example, you are allowed to add values to an Enum or to change a type from `DateTime` to `UInt32`). -If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](/docs/en/sql-reference/statements/insert-into.md/#insert_query_insert-select) query, then switch the tables using the [RENAME](/docs/en/sql-reference/statements/rename.md/#rename-table) query and delete the old table. You can use the [clickhouse-copier](/docs/en/operations/utilities/clickhouse-copier.md) as an alternative to the `INSERT SELECT` query. +If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](/docs/en/sql-reference/statements/insert-into.md/#inserting-the-results-of-select) query, then switch the tables using the [RENAME](/docs/en/sql-reference/statements/rename.md/#rename-table) query and delete the old table. You can use the [clickhouse-copier](/docs/en/operations/utilities/clickhouse-copier.md) as an alternative to the `INSERT SELECT` query. The `ALTER` query blocks all reads and writes for the table. In other words, if a long `SELECT` is running at the time of the `ALTER` query, the `ALTER` query will wait for it to complete. At the same time, all new queries to the same table will wait while this `ALTER` is running. diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index 4dcab030d13..30ed96c0b9c 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -10,7 +10,7 @@ sidebar_label: DELETE ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr ``` -Deletes data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +Deletes data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). 
:::note @@ -25,6 +25,6 @@ The synchronicity of the query processing is defined by the [mutations_sync](/do **See also** -- [Mutations](/docs/en/sql-reference/statements/alter/index.md/#mutations) -- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md/#synchronicity-of-alter-queries) +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) +- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) - [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 146c15e776e..a8cea63380c 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -270,7 +270,7 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' ## UPDATE IN PARTITION -Manipulates data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +Manipulates data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). Syntax: @@ -290,7 +290,7 @@ ALTER TABLE mt UPDATE x = x + 1 IN PARTITION 2 WHERE p = 2; ## DELETE IN PARTITION -Deletes data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +Deletes data in the specifies partition matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). Syntax: diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 99cb8fb8fd1..3f6f493aa89 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -138,15 +138,15 @@ The following operations with [projections](/docs/en/engines/table-engines/merge ## DROP PROJECTION -`ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +`ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). ## MATERIALIZE PROJECTION -`ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +`ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). ## CLEAR PROJECTION -`ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). 
+`ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. diff --git a/docs/en/sql-reference/statements/alter/skipping-index.md b/docs/en/sql-reference/statements/alter/skipping-index.md index 2dadffc4527..037e4bc38c5 100644 --- a/docs/en/sql-reference/statements/alter/skipping-index.md +++ b/docs/en/sql-reference/statements/alter/skipping-index.md @@ -14,7 +14,7 @@ The following operations are available: - `ALTER TABLE [db].table_name [ON CLUSTER cluster] DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. -- `ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data. +- `ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data. The first two commands are lightweight in a sense that they only change metadata or remove files. diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md index f40b72f7ab3..5d27c382982 100644 --- a/docs/en/sql-reference/statements/alter/update.md +++ b/docs/en/sql-reference/statements/alter/update.md @@ -10,7 +10,7 @@ sidebar_label: UPDATE ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` -Manipulates data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md/#mutations). +Manipulates data matching the specified filtering expression. Implemented as a [mutation](/docs/en/sql-reference/statements/alter/index.md#mutations). :::note The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. 
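A minimal sketch of the `ALTER TABLE ... UPDATE` syntax shown above, assuming a hypothetical `visits` table; the `mutations_sync` setting referenced just below controls whether the query waits for the mutation to complete:

``` sql
-- Rewrites the affected data parts in the background as a mutation.
ALTER TABLE visits UPDATE Referer = '' WHERE UserID = 123;
```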
@@ -24,7 +24,7 @@ The synchronicity of the query processing is defined by the [mutations_sync](/do **See also** -- [Mutations](/docs/en/sql-reference/statements/alter/index.md/#mutations) -- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md/#synchronicity-of-alter-queries) +- [Mutations](/docs/en/sql-reference/statements/alter/index.md#mutations) +- [Synchronicity of ALTER Queries](/docs/en/sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) - [mutations_sync](/docs/en/operations/settings/settings.md/#mutations_sync) setting diff --git a/docs/ru/operations/settings/index.md b/docs/ru/operations/settings/index.md index 4e055405847..6806aea5135 100644 --- a/docs/ru/operations/settings/index.md +++ b/docs/ru/operations/settings/index.md @@ -24,7 +24,7 @@ slug: /ru/operations/settings/ - При запуске консольного клиента ClickHouse в не интерактивном режиме установите параметр запуска `--setting=value`. - При использовании HTTP API передавайте cgi-параметры (`URL?setting_1=value&setting_2=value...`). - - Укажите необходимые настройки в секции [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select) запроса SELECT. Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию. + - Укажите необходимые настройки в секции [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select-query) запроса SELECT. Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию. Настройки, которые можно задать только в конфигурационном файле сервера, в разделе не рассматриваются. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index a070dbd5e10..58894611386 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -479,7 +479,7 @@ SELECT * FROM table_with_enum_column_for_tsv_insert; Включает или отключает вставку [значений по умолчанию](../../sql-reference/statements/create/table.md#create-default-values) вместо [NULL](../../sql-reference/syntax.md#null-literal) в столбцы, которые не позволяют [хранить NULL](../../sql-reference/data-types/nullable.md#data_type-nullable). Если столбец не позволяет хранить `NULL` и эта настройка отключена, то вставка `NULL` приведет к возникновению исключения. Если столбец позволяет хранить `NULL`, то значения `NULL` вставляются независимо от этой настройки. -Эта настройка используется для запросов [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#insert_query_insert-select). При этом подзапросы `SELECT` могут объединяться с помощью `UNION ALL`. +Эта настройка используется для запросов [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#inserting-the-results-of-select). При этом подзапросы `SELECT` могут объединяться с помощью `UNION ALL`. Возможные значения: diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md index 11ec72596c4..a8ace213075 100644 --- a/docs/ru/sql-reference/statements/alter/column.md +++ b/docs/ru/sql-reference/statements/alter/column.md @@ -254,7 +254,7 @@ SELECT groupArray(x), groupArray(s) FROM tmp; Отсутствует возможность удалять столбцы, входящие в первичный ключ или ключ для сэмплирования (в общем, входящие в выражение `ENGINE`). 
Изменение типа у столбцов, входящих в первичный ключ возможно только в том случае, если это изменение не приводит к изменению данных (например, разрешено добавление значения в Enum или изменение типа с `DateTime` на `UInt32`). -Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#insert_query_insert-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table), и удалить старую таблицу. В качестве альтернативы для запроса `INSERT SELECT`, можно использовать инструмент [clickhouse-copier](../../../sql-reference/statements/alter/index.md). +Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#inserting-the-results-of-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table), и удалить старую таблицу. В качестве альтернативы для запроса `INSERT SELECT`, можно использовать инструмент [clickhouse-copier](../../../sql-reference/statements/alter/index.md). Запрос `ALTER` блокирует все чтения и записи для таблицы. То есть если на момент запроса `ALTER` выполнялся долгий `SELECT`, то запрос `ALTER` сначала дождётся его выполнения. И в это время все новые запросы к той же таблице будут ждать, пока завершится этот `ALTER`. diff --git a/docs/ru/sql-reference/statements/insert-into.md b/docs/ru/sql-reference/statements/insert-into.md index 573b8d39926..4fa6ac4ce66 100644 --- a/docs/ru/sql-reference/statements/insert-into.md +++ b/docs/ru/sql-reference/statements/insert-into.md @@ -95,7 +95,7 @@ INSERT INTO t FORMAT TabSeparated Если в таблице объявлены [ограничения](../../sql-reference/statements/create/table.md#constraints), то их выполнимость будет проверена для каждой вставляемой строки. Если для хотя бы одной строки ограничения не будут выполнены, запрос будет остановлен. -### Вставка результатов `SELECT` {#insert_query_insert-select} +### Вставка результатов `SELECT` {#inserting-the-results-of-select} **Синтаксис** diff --git a/docs/ru/sql-reference/statements/select/index.md b/docs/ru/sql-reference/statements/select/index.md index 4479e24000b..f360a09eb10 100644 --- a/docs/ru/sql-reference/statements/select/index.md +++ b/docs/ru/sql-reference/statements/select/index.md @@ -270,7 +270,7 @@ SELECT * REPLACE(i + 1 AS i) EXCEPT (j) APPLY(sum) from columns_transformers; └─────────────────┴────────┘ ``` -## SETTINGS в запросе SELECT {#settings-in-select} +## SETTINGS в запросе SELECT {#settings-in-select-query} Вы можете задать значения необходимых настроек непосредственно в запросе `SELECT` в секции `SETTINGS`. Эти настройки действуют только в рамках данного запроса, а после его выполнения сбрасываются до предыдущего значения или значения по умолчанию. 
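The renamed `#settings-in-select-query` anchor above refers to the `SETTINGS` clause of `SELECT`; a small illustrative example (the setting and value are arbitrary):

``` sql
-- The setting applies only to this query and is reset afterwards.
SELECT count() FROM numbers(1000000) SETTINGS max_threads = 2;
```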
diff --git a/docs/zh/sql-reference/statements/alter.md b/docs/zh/sql-reference/statements/alter.md index 23edfd633db..fd73be4fd93 100644 --- a/docs/zh/sql-reference/statements/alter.md +++ b/docs/zh/sql-reference/statements/alter.md @@ -150,7 +150,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) 不支持对primary key或者sampling key中的列(在 `ENGINE` 表达式中用到的列)进行删除操作。改变包含在primary key中的列的类型时,如果操作不会导致数据的变化(例如,往Enum中添加一个值,或者将`DateTime` 类型改成 `UInt32`),那么这种操作是可行的。 -如果 `ALTER` 操作不足以完成你想要的表变动操作,你可以创建一张新的表,通过 [INSERT SELECT](../../sql-reference/statements/insert-into.md#insert_query_insert-select)将数据拷贝进去,然后通过 [RENAME](../../sql-reference/statements/misc.md#misc_operations-rename)将新的表改成和原有表一样的名称,并删除原有的表。你可以使用 [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) 代替 `INSERT SELECT`。 +如果 `ALTER` 操作不足以完成你想要的表变动操作,你可以创建一张新的表,通过 [INSERT SELECT](../../sql-reference/statements/insert-into.md#inserting-the-results-of-select)将数据拷贝进去,然后通过 [RENAME](../../sql-reference/statements/misc.md#misc_operations-rename)将新的表改成和原有表一样的名称,并删除原有的表。你可以使用 [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) 代替 `INSERT SELECT`。 `ALTER` 操作会阻塞对表的所有读写操作。换句话说,当一个大的 `SELECT` 语句和 `ALTER`同时执行时,`ALTER`会等待,直到 `SELECT` 执行结束。与此同时,当 `ALTER` 运行时,新的 sql 语句将会等待。 diff --git a/docs/zh/sql-reference/statements/insert-into.md b/docs/zh/sql-reference/statements/insert-into.md index 69762bf43bc..f199329829c 100644 --- a/docs/zh/sql-reference/statements/insert-into.md +++ b/docs/zh/sql-reference/statements/insert-into.md @@ -90,7 +90,7 @@ INSERT INTO t FORMAT TabSeparated 如果表中有一些[限制](../../sql-reference/statements/create/table.mdx#constraints),,数据插入时会逐行进行数据校验,如果这里面包含了不符合限制条件的数据,服务将会抛出包含限制信息的异常,这个语句也会被停止执行。 -### 使用`SELECT`的结果写入 {#insert_query_insert-select} +### 使用`SELECT`的结果写入 {#inserting-the-results-of-select} ``` sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... From ce8066e90e5c708899f05b2f0544dde4bd324718 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 16 Nov 2022 07:31:24 +0100 Subject: [PATCH 50/78] Add changelog for 21.11 --- CHANGELOG.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68767612892..ac1f0793c04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v22.11, 2022-11-17](#2211)**
**[ClickHouse release v22.10, 2022-10-26](#2210)**<br>
**[ClickHouse release v22.9, 2022-09-22](#229)**
**[ClickHouse release v22.8-lts, 2022-08-18](#228)**
@@ -11,6 +12,108 @@ **[ClickHouse release v22.1, 2022-01-18](#221)**
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**
+###
ClickHouse release 22.11, 2022-11-17 + +#### Backward Incompatible Change +* `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). + +#### New Feature +* Support for retries during INSERTs into ReplicatedMergeTree if a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). +* Add table function `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). +* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In case when the types of Intervals are different they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `**` glob support for recursive directory traversal to filesystem and S3. resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Introduce `s3_plain` disk type for write-once-read-many operations. Implement `ATTACH` of `MergeTree` table for `s3_plain` disk. [#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). +* Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). +* Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). +* Add function `ascii` like in spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). +* Add function `pmod` which return non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Add function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). +* Add function `randCanonical`, which is similar to the `rand` function in spark or impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). 
+* Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). +* Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). +* Add generic implementation for arbitrary structured named collections, access type and `system.named_collections`. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Performance Improvement +* `match` function can use the index if it's a condition on string prefix. This closes [#37333](https://github.com/ClickHouse/ClickHouse/issues/37333). [#42458](https://github.com/ClickHouse/ClickHouse/pull/42458) ([clarkcaoliu](https://github.com/Clark0)). +* Speed up AND and OR operators when they are sequenced. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Support parallel parsing for `LineAsString` input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). +* ClickHouse Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). +* A condition like `NOT LIKE 'prefix%'` can use primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). + +#### Experimental Feature +* Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). +* Ignore MySQL binlog SAVEPOINT event for MaterializedMySQL. [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). + +#### Improvement +* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). +* Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414) Implementation: * Added a new field allow_readonly to table system.table_functions. * Updated to use new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the english documentation for Table Functions. 
[#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* The `system.asynchronous_metrics` gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). [#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Throttling algorithm changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). +* Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). +* Remove covered parts for fetched part (to avoid possible replication delay grows). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). +* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to stderr. It allows to get progress even if stderr is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add support for `FixedString` input to base64 coding functions. [#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). +* Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). +* Improve using structure from insertion table in table functions, now setting `use_structure_from_insertion_table_in_table_functions` has new possible value - `2` that means that ClickHouse will try to determine if we can use structure from insertion table or not automatically. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix no progress indication on INSERT FROM INFILE. Closes [#42548](https://github.com/ClickHouse/ClickHouse/issues/42548). [#42634](https://github.com/ClickHouse/ClickHouse/pull/42634) ([chen](https://github.com/xiedeyantu)). +* Refactor function `tokens` to enable max tokens returned for related functions (disabled by default). [#42673](https://github.com/ClickHouse/ClickHouse/pull/42673) ([李扬](https://github.com/taiyang-li)). +* Allow to use `Date32` arguments for `formatDateTime` and `FROM_UNIXTIME` functions. [#42737](https://github.com/ClickHouse/ClickHouse/pull/42737) ([Roman Vasin](https://github.com/rvasin)). +* Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. 
[#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `FailedAsyncInsertQuery` event metric for async inserts. [#42814](https://github.com/ClickHouse/ClickHouse/pull/42814) ([Krzysztof Góralski](https://github.com/kgoralski)). +* Implement `read-in-order` optimization on top of query plan. It is enabled by default. Set `query_plan_read_in_order = 0` to use previous AST-based version. [#42829](https://github.com/ClickHouse/ClickHouse/pull/42829) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Increase the size of upload part exponentially for backup to S3 to avoid errors about max 10 000 parts limit of the multipart upload to s3. [#42833](https://github.com/ClickHouse/ClickHouse/pull/42833) ([Vitaly Baranov](https://github.com/vitlibar)). +* When the merge task is continuously busy and the disk space is insufficient, the completely expired parts cannot be selected and dropped, resulting in insufficient disk space. My idea is that when the entire Part expires, there is no need for additional disk space to guarantee, ensure the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). +* Add `oss` function and `OSS` table engine (this is convenient for users). oss is fully compatible with s3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). +* Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Modify the `INFORMATION_SCHEMA` tables in a way so that now ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Add some functions for compatibility with PowerBI, when it connects using MySQL protocol [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Better usability for Dashboard on changes [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). + +#### Build/Testing/Packaging Improvement +* Run SQLancer for each pull request and commit to master. [SQLancer](https://github.com/sqlancer/sqlancer) is an OpenSource fuzzer that focuses on automatic detection of logical bugs. [#42397](https://github.com/ClickHouse/ClickHouse/pull/42397) ([Ilya Yatsishin](https://github.com/qoega)). +* Update to latest zlib-ng. [#42463](https://github.com/ClickHouse/ClickHouse/pull/42463) ([Boris Kuschel](https://github.com/bkuschel)). +* Add support for testing ClickHouse server with Jepsen. By the way, we already have support for testing ClickHouse Keeper with Jepsen. This pull request extends it to Replicated tables. [#42619](https://github.com/ClickHouse/ClickHouse/pull/42619) ([Antonio Andelic](https://github.com/antonio2368)). +* Use https://github.com/matus-chochlik/ctcache for clang-tidy results caching. [#42913](https://github.com/ClickHouse/ClickHouse/pull/42913) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. 
The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove some libraries from Ubuntu Docker image. [#42622](https://github.com/ClickHouse/ClickHouse/pull/42622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Updated normaliser to clone the alias ast. resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone alias ast, when its replaced. Previously just assigning the same leads to exception in LogicalExpressinsOptimizer as it would be the same parent being inserted again. * This bug is not seen with new analyser (allow_experimental_analyzer), so no changes for it. I added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix race for backup of tables in `Lazy` databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix for `skip_unavailable_shards`: it did not work with the `s3Cluster` table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). +* Fix schema inference in `s3Cluster` and improvement in `hdfsCluster`. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix retries while reading from URL table engines / table function. (retrtiable errors could be retries more times than needed, non-retrialble errors resulted in failed assertion in code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). +* A segmentation fault related to DNS & c-ares has been reported and fixed. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with first constant argument. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). +* Fix typo in setting name that led to bad usage of schema inference cache while using setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix creating a Set with wrong header when data type is LowCardinality. Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). +* `(U)Int128` and `(U)Int256` values were correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix a bug in functions parser that could have led to a segmentation fault. 
[#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix the locking in `truncate table`. [#42728](https://github.com/ClickHouse/ClickHouse/pull/42728) ([flynn](https://github.com/ucasfl)). +* Fix possible crash in `web` disks when file does not exist (or `OPTIMIZE TABLE FINAL`, that also can got the same error eventually). [#42767](https://github.com/ClickHouse/ClickHouse/pull/42767) ([Azat Khuzhin](https://github.com/azat)). +* Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). +* Fix stack-use-after-return under ASAN build in the Create User query parser. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix `lowerUTF8`/`upperUTF8` in case of symbol was in between 16-byte boundary (very frequent case of you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). +* Additional bound check was added to lz4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). +* Fix rare possible hung on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect behavior with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). +* A null pointer will be generated when select if as from ‘three table join’ , For example, this SQL query: [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). +* Fix memory sanitizer report in Cluster Discovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). +* Improve datetime schema inference in case of empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when projection is possible to use but there is no projection available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771) . The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Fix ATTACH TABLE in `PostgreSQL` database engine if the table contains DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111) . [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)). 
+* Fix several buffer over-reads in deserialization of carefully crafted aggregate function states. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix decimal math overflow in parsing datetime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). +* The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fixed unexpected behaviour of `Interval` types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). + ### ClickHouse release 22.10, 2022-10-26 #### Backward Incompatible Change From 122508d844b44283679ff6753e16f665f23d1726 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 16 Nov 2022 10:22:28 +0000 Subject: [PATCH 51/78] Fix cloning of ASTIdentifier --- src/Parsers/ASTIdentifier.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Parsers/ASTIdentifier.cpp b/src/Parsers/ASTIdentifier.cpp index ca8ac0e8e64..341ac44b56e 100644 --- a/src/Parsers/ASTIdentifier.cpp +++ b/src/Parsers/ASTIdentifier.cpp @@ -63,6 +63,7 @@ ASTPtr ASTIdentifier::clone() const { auto ret = std::make_shared(*this); ret->semantic = std::make_shared(*ret->semantic); + ret->cloneChildren(); return ret; } From 981e6e2c24b96ef3e3516273d04864658fda1d72 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Wed, 16 Nov 2022 12:44:54 +0100 Subject: [PATCH 52/78] Update src/Analyzer/Passes/QueryAnalysisPass.cpp --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 760d036ab4f..2b76376c4c9 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2177,8 +2177,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier } else { - alias_name_to_node_map.erase(it); - std::unordered_set valid_identifiers; collectScopeWithParentScopesValidIdentifiersForTypoCorrection(identifier, scope, true, false, false, valid_identifiers); From 2d265523bbaf5dd1aff1f60ddae52640098da5cf Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Wed, 16 Nov 2022 12:45:54 +0100 Subject: [PATCH 53/78] Update src/Analyzer/Passes/QueryAnalysisPass.cpp --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 2b76376c4c9..138ff721f99 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2171,11 +2171,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier auto & alias_identifier_node = it->second->as(); auto identifier = alias_identifier_node.getIdentifier(); auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, 
identifier_resolve_settings); - if (lookup_result.isResolved()) - { - it->second = lookup_result.resolved_identifier; - } - else + if (!lookup_result.isResolved()) { std::unordered_set valid_identifiers; collectScopeWithParentScopesValidIdentifiersForTypoCorrection(identifier, scope, true, false, false, valid_identifiers); @@ -2187,6 +2183,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier scope.scope_node->formatASTForErrorMessage(), getHintsErrorMessageSuffix(hints)); } + it->second = lookup_result.resolved_identifier; /** During collection of aliases if node is identifier and has alias, we cannot say if it is * column or function node. Check QueryExpressionsAliasVisitor documentation for clarification. From b13a8d478bb9c1d1e9f333c26acb6b33a22c6d9b Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 16 Nov 2022 11:53:59 +0100 Subject: [PATCH 54/78] check limits for an AST in select parser fuzzer --- src/Parsers/fuzzers/select_parser_fuzzer.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Parsers/fuzzers/select_parser_fuzzer.cpp b/src/Parsers/fuzzers/select_parser_fuzzer.cpp index caa6c586cd6..3f712834c55 100644 --- a/src/Parsers/fuzzers/select_parser_fuzzer.cpp +++ b/src/Parsers/fuzzers/select_parser_fuzzer.cpp @@ -12,7 +12,15 @@ try std::string input = std::string(reinterpret_cast(data), size); DB::ParserQueryWithOutput parser(input.data() + input.size()); - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000); + + const UInt64 max_parser_depth = 1000; + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, max_parser_depth); + + const UInt64 max_ast_depth = 1000; + ast->checkDepth(max_ast_depth); + + const UInt64 max_ast_elements = 50000; + ast->checkSize(max_ast_elements); DB::WriteBufferFromOwnString wb; DB::formatAST(*ast, wb); From dcb76f6bbf1db37b15da40bab450760e7115bcea Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 16 Nov 2022 14:16:42 +0100 Subject: [PATCH 55/78] Allow autoremoval of old parts if detach_not_byte_identical_parts enabled Allow autoremoval of old & detached parts if detach_not_byte_identical_parts enabled. See also #28708 #37975 --- src/Storages/MergeTree/MergeTreePartInfo.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreePartInfo.h b/src/Storages/MergeTree/MergeTreePartInfo.h index c19cc55e74e..60c7e4e8822 100644 --- a/src/Storages/MergeTree/MergeTreePartInfo.h +++ b/src/Storages/MergeTree/MergeTreePartInfo.h @@ -154,6 +154,8 @@ struct DetachedPartInfo : public MergeTreePartInfo "deleting", "tmp-fetch", "covered-by-broken", + "merge-not-byte-identical", + "mutate-not-byte-identical" }); static constexpr auto DETACHED_REASONS_REMOVABLE_BY_TIMEOUT = std::to_array({ @@ -163,7 +165,9 @@ struct DetachedPartInfo : public MergeTreePartInfo "ignored", "broken-on-start", "deleting", - "clone" + "clone", + "merge-not-byte-identical", + "mutate-not-byte-identical" }); /// NOTE: It may parse part info incorrectly. 
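To see which detached parts fall under these reasons in practice, a hypothetical query against `system.detached_parts` (the `bytes_on_disk` column is the one added to this table in 22.11 above):

``` sql
-- Parts whose reason is listed in DETACHED_REASONS_REMOVABLE_BY_TIMEOUT are
-- eventually removed automatically; the rest have to be dropped manually.
SELECT database, table, name, reason, bytes_on_disk
FROM system.detached_parts
ORDER BY bytes_on_disk DESC;
```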
From ea8c7df296deeac1c5f65416d03c73f2b044089f Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 16 Nov 2022 09:32:13 -0500 Subject: [PATCH 56/78] edits --- CHANGELOG.md | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac1f0793c04..051bde44dd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,18 +18,18 @@ * `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). #### New Feature -* Support for retries during INSERTs into ReplicatedMergeTree if a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Adds support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). * Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). * Add table function `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). -* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In case when the types of Intervals are different they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added `**` glob support for recursive directory traversal to filesystem and S3. resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In the case where the types of Intervals are different, they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `**` glob support for recursive directory traversal of the filesystem and S3. Resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * Introduce `s3_plain` disk type for write-once-read-many operations. Implement `ATTACH` of `MergeTree` table for `s3_plain` disk. 
[#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). * Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). * Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). -* Add function `ascii` like in spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). -* Add function `pmod` which return non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Add function `ascii` like in Apache Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). +* Add function `pmod` which returns non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). * Add function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). -* Add function `randCanonical`, which is similar to the `rand` function in spark or impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). +* Add function `randCanonical`, which is similar to the `rand` function in Apache Spark or Impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). * Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). * Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). * Add generic implementation for arbitrary structured named collections, access type and `system.named_collections`. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). @@ -39,21 +39,21 @@ * Speed up AND and OR operators when they are sequenced. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). * Support parallel parsing for `LineAsString` input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). * ClickHouse Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. 
[#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). -* A condition like `NOT LIKE 'prefix%'` can use primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). +* A condition like `NOT LIKE 'prefix%'` can use the primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). #### Experimental Feature * Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). * Ignore MySQL binlog SAVEPOINT event for MaterializedMySQL. [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). #### Improvement -* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). +* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so that the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). * Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414) Implementation: * Added a new field allow_readonly to table system.table_functions. * Updated to use new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the english documentation for Table Functions. [#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode. Resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414) Implementation: * Added a new field allow_readonly to table system.table_functions. * Updated to use new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the english documentation for Table Functions. [#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * The `system.asynchronous_metrics` gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). 
[#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Throttling algorithm changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). * Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). * Remove covered parts for fetched part (to avoid possible replication delay grows). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). -* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to stderr. It allows to get progress even if stderr is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to STDERR. It allows getting progress even if STDERR is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Add support for `FixedString` input to base64 coding functions. [#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). * Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). * Improve using structure from insertion table in table functions, now setting `use_structure_from_insertion_table_in_table_functions` has new possible value - `2` that means that ClickHouse will try to determine if we can use structure from insertion table or not automatically. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). @@ -67,7 +67,7 @@ * When the merge task is continuously busy and the disk space is insufficient, the completely expired parts cannot be selected and dropped, resulting in insufficient disk space. My idea is that when the entire Part expires, there is no need for additional disk space to guarantee, ensure the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). * Add `oss` function and `OSS` table engine (this is convenient for users). oss is fully compatible with s3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). * Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
-* Modify the `INFORMATION_SCHEMA` tables in a way so that now ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Modify the `INFORMATION_SCHEMA` tables in a way so that ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). * Add some functions for compatibility with PowerBI, when it connects using MySQL protocol [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). * Better usability for Dashboard on changes [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). @@ -81,15 +81,15 @@ #### Bug Fix (user-visible misbehavior in official stable or prestable release) -* Updated normaliser to clone the alias ast. resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone alias ast, when its replaced. Previously just assigning the same leads to exception in LogicalExpressinsOptimizer as it would be the same parent being inserted again. * This bug is not seen with new analyser (allow_experimental_analyzer), so no changes for it. I added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Updated normaliser to clone the alias ast. Resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone alias ast, when its replaced. Previously just assigning the same leads to exception in LogicalExpressinsOptimizer as it would be the same parent being inserted again. * This bug is not seen with new analyser (allow_experimental_analyzer), so no changes for it. I added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * Fix race for backup of tables in `Lazy` databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). * Fix for `skip_unavailable_shards`: it did not work with the `s3Cluster` table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). * Fix schema inference in `s3Cluster` and improvement in `hdfsCluster`. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix retries while reading from URL table engines / table function. (retrtiable errors could be retries more times than needed, non-retrialble errors resulted in failed assertion in code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix retries while reading from URL table engines / table function. (retriable errors could be retries more times than needed, non-retriable errors resulted in failed assertion in code). 
[#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). * A segmentation fault related to DNS & c-ares has been reported and fixed. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). * Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with first constant argument. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). -* Fix typo in setting name that led to bad usage of schema inference cache while using setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix typo in a setting name that led to bad usage of schema inference cache while using setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). * Fix creating a Set with wrong header when data type is LowCardinality. Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). * `(U)Int128` and `(U)Int256` values were correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). * Fix a bug in functions parser that could have led to a segmentation fault. [#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). @@ -98,19 +98,19 @@ * Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). * Fix stack-use-after-return under ASAN build in the Create User query parser. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). * Fix `lowerUTF8`/`upperUTF8` in case of symbol was in between 16-byte boundary (very frequent case of you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). -* Additional bound check was added to lz4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). -* Fix rare possible hung on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Additional bound check was added to LZ4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). +* Fix rare possible hang on query cancellation. 
[#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). * Fix incorrect behavior with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). * A null pointer will be generated when select if as from ‘three table join’ , For example, this SQL query: [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). * Fix memory sanitizer report in Cluster Discovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). -* Improve datetime schema inference in case of empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). +* Improve DateTime schema inference in case of empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). * Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when projection is possible to use but there is no projection available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771) . The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). * Fix ATTACH TABLE in `PostgreSQL` database engine if the table contains DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). * Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111) . [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)). * Fix several buffer over-reads in deserialization of carefully crafted aggregate function states. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). * Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix decimal math overflow in parsing datetime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix decimal math overflow in parsing DateTime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). * The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
* Fixed unexpected behaviour of `Interval` types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). @@ -673,7 +673,7 @@ * Support SQL standard CREATE INDEX and DROP INDEX syntax. [#35166](https://github.com/ClickHouse/ClickHouse/pull/35166) ([Jianmei Zhang](https://github.com/zhangjmruc)). * Send profile events for INSERT queries (previously only SELECT was supported). [#37391](https://github.com/ClickHouse/ClickHouse/pull/37391) ([Azat Khuzhin](https://github.com/azat)). * Implement in order aggregation (`optimize_aggregation_in_order`) for fully materialized projections. [#37469](https://github.com/ClickHouse/ClickHouse/pull/37469) ([Azat Khuzhin](https://github.com/azat)). -* Remove subprocess run for kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). +* Remove subprocess run for Kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). * * Add setting `multiple_joins_try_to_keep_original_names` to not rewrite identifier name on multiple JOINs rewrite, close [#34697](https://github.com/ClickHouse/ClickHouse/issues/34697). [#38149](https://github.com/ClickHouse/ClickHouse/pull/38149) ([Vladimir C](https://github.com/vdimir)). * Improved trace-visualizer UX. [#38169](https://github.com/ClickHouse/ClickHouse/pull/38169) ([Sergei Trifonov](https://github.com/serxa)). * Enable stack trace collection and query profiler for AArch64. [#38181](https://github.com/ClickHouse/ClickHouse/pull/38181) ([Maksim Kita](https://github.com/kitaisreal)). @@ -953,8 +953,8 @@ #### Upgrade Notes -* Now, background merges, mutations and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values, and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes of metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. +* Now, background merges, mutations, and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes to the metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant in this area. 
[#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. * `max_memory_usage` setting is removed from the default user profile in `users.xml`. This enables flexible memory limits for queries instead of the old rigid limit of 10 GB. * Disable `log_query_threads` setting by default. It controls the logging of statistics about every thread participating in query execution. After supporting asynchronous reads, the total number of distinct thread ids became too large, and logging into the `query_thread_log` has become too heavy. [#37077](https://github.com/ClickHouse/ClickHouse/pull/37077) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Remove function `groupArraySorted` which has a bug. [#36822](https://github.com/ClickHouse/ClickHouse/pull/36822) ([Alexey Milovidov](https://github.com/alexey-milovidov)). From b280b683332bcd1e0689eec0be9c684f03d1dd3f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 16 Nov 2022 15:57:50 +0100 Subject: [PATCH 57/78] Fix style. --- src/Backups/BackupSettings.cpp | 2 +- src/Backups/RestoreSettings.cpp | 2 +- src/Parsers/ParserBackupQuery.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index bed34f2fbcc..5266296c248 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -128,7 +128,7 @@ void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const auto base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; if (base_backup_name) - query.setOrReplace(query.base_backup_name, std::move(base_backup_name)); + query.setOrReplace(query.base_backup_name, base_backup_name); else query.reset(query.base_backup_name); diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index 322954abc2b..2c06ee907b5 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -216,7 +216,7 @@ void RestoreSettings::copySettingsToQuery(ASTBackupQuery & query) const auto base_backup_name = base_backup_info ? 
base_backup_info->toAST() : nullptr; if (base_backup_name) - query.setOrReplace(query.base_backup_name, std::move(base_backup_name)); + query.setOrReplace(query.base_backup_name, base_backup_name); else query.reset(query.base_backup_name); diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index 78bc10511b2..cead1a98c1a 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -365,13 +365,13 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) query->cluster = std::move(cluster); if (backup_name) - query->set(query->backup_name, std::move(backup_name)); + query->set(query->backup_name, backup_name); query->settings = std::move(settings); query->cluster_host_ids = std::move(cluster_host_ids); if (base_backup_name) - query->set(query->base_backup_name, std::move(base_backup_name)); + query->set(query->base_backup_name, base_backup_name); return true; } From bc19d350090bb37e446c27d17d04f1707f901522 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 16 Nov 2022 15:34:36 +0000 Subject: [PATCH 58/78] fix clang-tidy --- src/Columns/ColumnObject.cpp | 6 +++--- src/Columns/ColumnTuple.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index c3e8af5a8cf..b2adf9e0059 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -666,14 +666,14 @@ size_t ColumnObject::allocatedBytes() const void ColumnObject::forEachSubcolumn(ColumnCallback callback) const { - for (auto & entry : subcolumns) - for (auto & part : entry->data.data) + for (const auto & entry : subcolumns) + for (const auto & part : entry->data.data) callback(part); } void ColumnObject::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { - for (auto & entry : subcolumns) + for (const auto & entry : subcolumns) { for (const auto & part : entry->data.data) { diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index c8ac2c67c36..8827b7b9850 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -497,13 +497,13 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const void ColumnTuple::forEachSubcolumn(ColumnCallback callback) const { - for (auto & column : columns) + for (const auto & column : columns) callback(column); } void ColumnTuple::forEachSubcolumnRecursively(RecursiveColumnCallback callback) const { - for (auto & column : columns) + for (const auto & column : columns) { callback(*column); column->forEachSubcolumnRecursively(callback); From 5e087ae967db4b4cd1e2642689a971c8e7d69c28 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 16 Nov 2022 11:54:06 -0500 Subject: [PATCH 59/78] link to tutorial --- .../dictionaries/external-dictionaries/external-dicts.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md index 06b5b8a6746..9f922a2cccb 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -7,7 +7,9 @@ import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dict # Dictionaries - +:::tip Tutorial +If you are getting started with Dictionaries in ClickHouse we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md). 
+::: You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”. @@ -27,6 +29,8 @@ The [dictionaries](../../../operations/system-tables/dictionaries.md#system_tabl - Configuration parameters. - Metrics like amount of RAM allocated for the dictionary or a number of queries since the dictionary was successfully loaded. + + ## Creating a dictionary with a DDL query Dictionaries can be created with [DDL queries](../../../sql-reference/statements/create/dictionary.md), and this is the recommended method because with DDL created dictionaries: From 4016fd2efcb196a97bfc157f661acf6c165f1fa8 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 16 Nov 2022 11:58:10 -0500 Subject: [PATCH 60/78] remove changes --- CHANGELOG.md | 109 ++------------------------------------------------- 1 file changed, 3 insertions(+), 106 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 051bde44dd2..68767612892 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,4 @@ ### Table of Contents -**[ClickHouse release v22.11, 2022-11-17](#2211)**
**[ClickHouse release v22.10, 2022-10-26](#2210)**<br/>
**[ClickHouse release v22.9, 2022-09-22](#229)**
**[ClickHouse release v22.8-lts, 2022-08-18](#228)**
@@ -12,108 +11,6 @@ **[ClickHouse release v22.1, 2022-01-18](#221)**
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**
-### ClickHouse release 22.11, 2022-11-17 - -#### Backward Incompatible Change -* `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). - -#### New Feature -* Adds support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). -* Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). -* Add table function `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). -* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In the case where the types of Intervals are different, they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added `**` glob support for recursive directory traversal of the filesystem and S3. Resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Introduce `s3_plain` disk type for write-once-read-many operations. Implement `ATTACH` of `MergeTree` table for `s3_plain` disk. [#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). -* Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). -* Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). -* Add function `ascii` like in Apache Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). -* Add function `pmod` which returns non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). -* Add function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). -* Add function `randCanonical`, which is similar to the `rand` function in Apache Spark or Impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). 
-* Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). -* Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). -* Add generic implementation for arbitrary structured named collections, access type and `system.named_collections`. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). - -#### Performance Improvement -* `match` function can use the index if it's a condition on string prefix. This closes [#37333](https://github.com/ClickHouse/ClickHouse/issues/37333). [#42458](https://github.com/ClickHouse/ClickHouse/pull/42458) ([clarkcaoliu](https://github.com/Clark0)). -* Speed up AND and OR operators when they are sequenced. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). -* Support parallel parsing for `LineAsString` input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). -* ClickHouse Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). -* A condition like `NOT LIKE 'prefix%'` can use the primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). - -#### Experimental Feature -* Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). -* Ignore MySQL binlog SAVEPOINT event for MaterializedMySQL. [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). - -#### Improvement -* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so that the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). -* Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode. Resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414) Implementation: * Added a new field allow_readonly to table system.table_functions. * Updated to use new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the english documentation for Table Functions. 
[#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* The `system.asynchronous_metrics` gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). [#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Throttling algorithm changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). -* Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). -* Remove covered parts for fetched part (to avoid possible replication delay grows). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). -* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to STDERR. It allows getting progress even if STDERR is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add support for `FixedString` input to base64 coding functions. [#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). -* Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). -* Improve using structure from insertion table in table functions, now setting `use_structure_from_insertion_table_in_table_functions` has new possible value - `2` that means that ClickHouse will try to determine if we can use structure from insertion table or not automatically. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix no progress indication on INSERT FROM INFILE. Closes [#42548](https://github.com/ClickHouse/ClickHouse/issues/42548). [#42634](https://github.com/ClickHouse/ClickHouse/pull/42634) ([chen](https://github.com/xiedeyantu)). -* Refactor function `tokens` to enable max tokens returned for related functions (disabled by default). [#42673](https://github.com/ClickHouse/ClickHouse/pull/42673) ([李扬](https://github.com/taiyang-li)). -* Allow to use `Date32` arguments for `formatDateTime` and `FROM_UNIXTIME` functions. [#42737](https://github.com/ClickHouse/ClickHouse/pull/42737) ([Roman Vasin](https://github.com/rvasin)). -* Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209. 
[#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add `FailedAsyncInsertQuery` event metric for async inserts. [#42814](https://github.com/ClickHouse/ClickHouse/pull/42814) ([Krzysztof Góralski](https://github.com/kgoralski)). -* Implement `read-in-order` optimization on top of query plan. It is enabled by default. Set `query_plan_read_in_order = 0` to use previous AST-based version. [#42829](https://github.com/ClickHouse/ClickHouse/pull/42829) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Increase the size of upload part exponentially for backup to S3 to avoid errors about max 10 000 parts limit of the multipart upload to s3. [#42833](https://github.com/ClickHouse/ClickHouse/pull/42833) ([Vitaly Baranov](https://github.com/vitlibar)). -* When the merge task is continuously busy and the disk space is insufficient, the completely expired parts cannot be selected and dropped, resulting in insufficient disk space. My idea is that when the entire Part expires, there is no need for additional disk space to guarantee, ensure the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). -* Add `oss` function and `OSS` table engine (this is convenient for users). oss is fully compatible with s3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). -* Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Modify the `INFORMATION_SCHEMA` tables in a way so that ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Add some functions for compatibility with PowerBI, when it connects using MySQL protocol [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Better usability for Dashboard on changes [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). - -#### Build/Testing/Packaging Improvement -* Run SQLancer for each pull request and commit to master. [SQLancer](https://github.com/sqlancer/sqlancer) is an OpenSource fuzzer that focuses on automatic detection of logical bugs. [#42397](https://github.com/ClickHouse/ClickHouse/pull/42397) ([Ilya Yatsishin](https://github.com/qoega)). -* Update to latest zlib-ng. [#42463](https://github.com/ClickHouse/ClickHouse/pull/42463) ([Boris Kuschel](https://github.com/bkuschel)). -* Add support for testing ClickHouse server with Jepsen. By the way, we already have support for testing ClickHouse Keeper with Jepsen. This pull request extends it to Replicated tables. [#42619](https://github.com/ClickHouse/ClickHouse/pull/42619) ([Antonio Andelic](https://github.com/antonio2368)). -* Use https://github.com/matus-chochlik/ctcache for clang-tidy results caching. [#42913](https://github.com/ClickHouse/ClickHouse/pull/42913) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`. 
The PR fixes it and won't replace the user's files from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Remove some libraries from Ubuntu Docker image. [#42622](https://github.com/ClickHouse/ClickHouse/pull/42622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). - -#### Bug Fix (user-visible misbehavior in official stable or prestable release) - -* Updated normaliser to clone the alias ast. Resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone alias ast, when its replaced. Previously just assigning the same leads to exception in LogicalExpressinsOptimizer as it would be the same parent being inserted again. * This bug is not seen with new analyser (allow_experimental_analyzer), so no changes for it. I added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). -* Fix race for backup of tables in `Lazy` databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix for `skip_unavailable_shards`: it did not work with the `s3Cluster` table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). -* Fix schema inference in `s3Cluster` and improvement in `hdfsCluster`. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix retries while reading from URL table engines / table function. (retriable errors could be retries more times than needed, non-retriable errors resulted in failed assertion in code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). -* A segmentation fault related to DNS & c-ares has been reported and fixed. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). -* Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with first constant argument. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). -* Fix typo in a setting name that led to bad usage of schema inference cache while using setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix creating a Set with wrong header when data type is LowCardinality. Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). -* `(U)Int128` and `(U)Int256` values were correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix a bug in functions parser that could have led to a segmentation fault. 
[#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix the locking in `truncate table`. [#42728](https://github.com/ClickHouse/ClickHouse/pull/42728) ([flynn](https://github.com/ucasfl)). -* Fix possible crash in `web` disks when file does not exist (or `OPTIMIZE TABLE FINAL`, that also can got the same error eventually). [#42767](https://github.com/ClickHouse/ClickHouse/pull/42767) ([Azat Khuzhin](https://github.com/azat)). -* Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). -* Fix stack-use-after-return under ASAN build in the Create User query parser. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix `lowerUTF8`/`upperUTF8` in case of symbol was in between 16-byte boundary (very frequent case of you have strings > 16 bytes long). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). -* Additional bound check was added to LZ4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). -* Fix rare possible hang on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). -* Fix incorrect behavior with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). -* A null pointer will be generated when select if as from ‘three table join’ , For example, this SQL query: [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). -* Fix memory sanitizer report in Cluster Discovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). -* Improve DateTime schema inference in case of empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when projection is possible to use but there is no projection available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771) . The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). -* Fix ATTACH TABLE in `PostgreSQL` database engine if the table contains DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). -* Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111) . [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)). 
-* Fix several buffer over-reads in deserialization of carefully crafted aggregate function states. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). -* Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix decimal math overflow in parsing DateTime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). -* The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fixed unexpected behaviour of `Interval` types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). - ### ClickHouse release 22.10, 2022-10-26 #### Backward Incompatible Change @@ -673,7 +570,7 @@ * Support SQL standard CREATE INDEX and DROP INDEX syntax. [#35166](https://github.com/ClickHouse/ClickHouse/pull/35166) ([Jianmei Zhang](https://github.com/zhangjmruc)). * Send profile events for INSERT queries (previously only SELECT was supported). [#37391](https://github.com/ClickHouse/ClickHouse/pull/37391) ([Azat Khuzhin](https://github.com/azat)). * Implement in order aggregation (`optimize_aggregation_in_order`) for fully materialized projections. [#37469](https://github.com/ClickHouse/ClickHouse/pull/37469) ([Azat Khuzhin](https://github.com/azat)). -* Remove subprocess run for Kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). +* Remove subprocess run for kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). * * Add setting `multiple_joins_try_to_keep_original_names` to not rewrite identifier name on multiple JOINs rewrite, close [#34697](https://github.com/ClickHouse/ClickHouse/issues/34697). [#38149](https://github.com/ClickHouse/ClickHouse/pull/38149) ([Vladimir C](https://github.com/vdimir)). * Improved trace-visualizer UX. [#38169](https://github.com/ClickHouse/ClickHouse/pull/38169) ([Sergei Trifonov](https://github.com/serxa)). * Enable stack trace collection and query profiler for AArch64. [#38181](https://github.com/ClickHouse/ClickHouse/pull/38181) ([Maksim Kita](https://github.com/kitaisreal)). @@ -953,8 +850,8 @@ #### Upgrade Notes -* Now, background merges, mutations, and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes to the metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
-* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant in this area. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. +* Now, background merges, mutations and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values, and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes of metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. * `max_memory_usage` setting is removed from the default user profile in `users.xml`. This enables flexible memory limits for queries instead of the old rigid limit of 10 GB. * Disable `log_query_threads` setting by default. It controls the logging of statistics about every thread participating in query execution. After supporting asynchronous reads, the total number of distinct thread ids became too large, and logging into the `query_thread_log` has become too heavy. [#37077](https://github.com/ClickHouse/ClickHouse/pull/37077) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Remove function `groupArraySorted` which has a bug. [#36822](https://github.com/ClickHouse/ClickHouse/pull/36822) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
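The upgrade notes above only change defaults; they do not remove the settings themselves. As a rough sketch (assuming a standard client session and the 10 GB figure quoted in the note above), the previous behaviour can be restored per session:

```sql
-- Hypothetical per-session overrides; both settings still exist, only their defaults changed.
SET max_memory_usage = 10000000000;  -- reinstate the old ~10 GB per-query limit
SET log_query_threads = 1;           -- re-enable per-thread entries in system.query_thread_log
```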
From 9011a18234f26914104a3762e3e7e8a2e5f99661 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 16 Nov 2022 22:33:48 +0300 Subject: [PATCH 61/78] Revert "S3 request per second rate throttling" --- .../engines/table-engines/integrations/s3.md | 5 -- .../mergetree-family/mergetree.md | 4 -- src/Backups/BackupIO_S3.cpp | 27 +++++---- src/Backups/BackupIO_S3.h | 5 +- src/Common/ProfileEvents.cpp | 2 +- src/Common/Throttler.cpp | 2 + src/Common/Throttler.h | 2 - src/Coordination/KeeperSnapshotManagerS3.cpp | 12 ++-- src/Core/Settings.h | 4 -- .../ObjectStorages/S3/S3ObjectStorage.cpp | 23 ++++---- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 6 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 38 ++++--------- src/Disks/ObjectStorages/S3/diskSettings.h | 2 +- src/Disks/ObjectStorages/S3/parseConfig.h | 27 +++++++++ .../ObjectStorages/S3/registerDiskS3.cpp | 13 +++-- src/IO/ReadBufferFromS3.cpp | 10 ++-- src/IO/ReadBufferFromS3.h | 11 ++-- src/IO/S3/PocoHTTPClient.cpp | 26 +-------- src/IO/S3/PocoHTTPClient.h | 17 +----- src/IO/S3/tests/gtest_aws_s3_client.cpp | 8 +-- src/IO/S3Common.cpp | 31 ++-------- src/IO/S3Common.h | 5 +- src/IO/WriteBufferFromS3.cpp | 20 +++---- src/IO/WriteBufferFromS3.h | 4 +- .../ExternalDataSourceConfiguration.h | 2 +- src/Storages/StorageDelta.cpp | 6 +- src/Storages/StorageHudi.cpp | 2 +- src/Storages/StorageS3.cpp | 56 +++++++++---------- src/Storages/StorageS3.h | 13 ++--- src/Storages/StorageS3Cluster.cpp | 2 +- src/Storages/StorageS3Settings.cpp | 47 +++++----------- src/Storages/StorageS3Settings.h | 19 +++---- .../02477_s3_request_throttler.reference | 2 - .../02477_s3_request_throttler.sql | 26 --------- 34 files changed, 173 insertions(+), 306 deletions(-) create mode 100644 src/Disks/ObjectStorages/S3/parseConfig.h delete mode 100644 tests/queries/0_stateless/02477_s3_request_throttler.reference delete mode 100644 tests/queries/0_stateless/02477_s3_request_throttler.sql diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 484fd265c3d..db983ab9c68 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -127,10 +127,6 @@ The following settings can be set before query execution or placed into configur - `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. - `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. - `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. -- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). -- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`. -- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). -- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. 
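The four throttling settings removed above were query-level settings, so before this revert they could be combined with the other per-query S3 settings documented here. A minimal sketch (the bucket URL is a placeholder, and the settings are only available on builds that still include the feature):

```sql
-- Illustrative only: throttle GET requests for a single S3 read.
SELECT count()
FROM s3('https://example-bucket.s3.amazonaws.com/data/*.csv', 'CSVWithNames')
SETTINGS s3_max_get_rps = 100, s3_max_get_burst = 200;
```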
Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. @@ -146,7 +142,6 @@ The following settings can be specified in configuration file for given endpoint - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. - `max_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. Optional. -- `max_put_rps`, `max_put_burst`, `max_get_rps` and `max_get_burst` - Throttling settings (see description above) to use for specific endpoint instead of per query. Optional. **Example:** diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 7614a09c018..ce6cec079a3 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -940,10 +940,6 @@ Optional parameters: - `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks//cache/`. - `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. -- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). -- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`. -- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). -- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. 
S3 disk can be configured as `main` or `cold` storage: ``` xml diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 8342749e230..f7f7643a6e3 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -46,7 +46,7 @@ namespace context->getRemoteHostFilter(), static_cast(context->getGlobalContext()->getSettingsRef().s3_max_redirects), context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {}); + /* for_disk_s3 = */ false); client_configuration.endpointOverride = s3_uri.endpoint; client_configuration.maxConnections = static_cast(context->getSettingsRef().s3_max_connections); @@ -86,10 +86,9 @@ BackupReaderS3::BackupReaderS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) + , max_single_read_retries(context_->getSettingsRef().s3_max_single_read_retries) , read_settings(context_->getReadSettings()) - , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) { - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } DataSourceDescription BackupReaderS3::getDataSourceDescription() const @@ -116,7 +115,7 @@ UInt64 BackupReaderS3::getFileSize(const String & file_name) std::unique_ptr BackupReaderS3::readFile(const String & file_name) { return std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, max_single_read_retries, read_settings); } @@ -124,12 +123,12 @@ BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) : s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) + , max_single_read_retries(context_->getSettingsRef().s3_max_single_read_retries) , read_settings(context_->getReadSettings()) - , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) + , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings) , log(&Poco::Logger::get("BackupWriterS3")) { - request_settings.updateFromSettingsIfEmpty(context_->getSettingsRef()); - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint + rw_settings.updateFromSettingsIfEmpty(context_->getSettingsRef()); } DataSourceDescription BackupWriterS3::getDataSourceDescription() const @@ -217,7 +216,7 @@ void BackupWriterS3::copyObjectMultipartImpl( std::vector part_tags; size_t position = 0; - size_t upload_part_size = request_settings.min_upload_part_size; + size_t upload_part_size = rw_settings.min_upload_part_size; for (size_t part_number = 1; position < size; ++part_number) { @@ -249,10 +248,10 @@ void BackupWriterS3::copyObjectMultipartImpl( position = next_position; - if (part_number % request_settings.upload_part_size_multiply_parts_count_threshold == 0) + if (part_number % rw_settings.upload_part_size_multiply_parts_count_threshold == 0) { - upload_part_size *= request_settings.upload_part_size_multiply_factor; - upload_part_size = 
std::min(upload_part_size, request_settings.max_upload_part_size); + upload_part_size *= rw_settings.upload_part_size_multiply_factor; + upload_part_size = std::min(upload_part_size, rw_settings.max_upload_part_size); } } @@ -295,7 +294,7 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_ auto file_path = fs::path(s3_uri.key) / file_name_to; auto head = requestObjectHeadData(source_bucket, objects[0].absolute_path).GetResult(); - if (static_cast(head.GetContentLength()) < request_settings.max_single_operation_copy_size) + if (static_cast(head.GetContentLength()) < rw_settings.max_single_operation_copy_size) { copyObjectImpl( source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head); @@ -332,7 +331,7 @@ bool BackupWriterS3::fileContentsEqual(const String & file_name, const String & try { auto in = std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, max_single_read_retries, read_settings); String actual_file_contents(expected_file_contents.size(), ' '); return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) && (actual_file_contents == expected_file_contents) && in->eof(); @@ -350,7 +349,7 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, - request_settings, + rw_settings, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "BackupWriterS3")); diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index deaf7979ff8..b52de23e262 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -39,8 +39,8 @@ public: private: S3::URI s3_uri; std::shared_ptr client; + UInt64 max_single_read_retries; ReadSettings read_settings; - S3Settings::RequestSettings request_settings; }; @@ -81,8 +81,9 @@ private: S3::URI s3_uri; std::shared_ptr client; + UInt64 max_single_read_retries; ReadSettings read_settings; - S3Settings::RequestSettings request_settings; + S3Settings::ReadWriteSettings rw_settings; Poco::Logger * log; }; diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 90d24ec027e..e30a6bb6aaf 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -62,7 +62,7 @@ M(NetworkSendElapsedMicroseconds, "Total time spent waiting for data to send to network or sending data to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries..") \ M(NetworkReceiveBytes, "Total number of bytes received from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \ - M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_network_bandwidth' and other throttling settings.") \ + M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform the 'max_network_bandwidth' setting.") \ \ M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \ \ diff --git a/src/Common/Throttler.cpp b/src/Common/Throttler.cpp index 79625d4894c..b38777efc03 100644 --- a/src/Common/Throttler.cpp +++ b/src/Common/Throttler.cpp @@ -20,6 +20,8 @@ namespace ErrorCodes /// Just 10^9. 
static constexpr auto NS = 1000000000UL; +static const size_t default_burst_seconds = 1; + Throttler::Throttler(size_t max_speed_, const std::shared_ptr & parent_) : max_speed(max_speed_) , max_burst(max_speed_ * default_burst_seconds) diff --git a/src/Common/Throttler.h b/src/Common/Throttler.h index 708e9b939fa..9b6eff13506 100644 --- a/src/Common/Throttler.h +++ b/src/Common/Throttler.h @@ -17,8 +17,6 @@ namespace DB class Throttler { public: - static const size_t default_burst_seconds = 1; - Throttler(size_t max_speed_, size_t max_burst_, const std::shared_ptr & parent_ = nullptr) : max_speed(max_speed_), max_burst(max_burst_), limit_exceeded_exception_message(""), tokens(max_burst), parent(parent_) {} diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index cc837f5f496..2e19d496407 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -93,7 +93,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo auth_settings.region, RemoteHostFilter(), s3_max_redirects, enable_s3_requests_logging, - /* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {}); + /* for_disk_s3 = */ false); client_configuration.endpointOverride = new_uri.endpoint; @@ -135,8 +135,8 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa if (s3_client == nullptr) return; - S3Settings::RequestSettings request_settings_1; - request_settings_1.upload_part_size_multiply_parts_count_threshold = 10000; + S3Settings::ReadWriteSettings read_write_settings; + read_write_settings.upload_part_size_multiply_parts_count_threshold = 10000; const auto create_writer = [&](const auto & key) { @@ -145,7 +145,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa s3_client->client, s3_client->uri.bucket, key, - request_settings_1 + read_write_settings }; }; @@ -194,15 +194,13 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa lock_writer.finalize(); // We read back the written UUID, if it's the same we can upload the file - S3Settings::RequestSettings request_settings_2; - request_settings_2.max_single_read_retries = 1; ReadBufferFromS3 lock_reader { s3_client->client, s3_client->uri.bucket, lock_file, "", - request_settings_2, + 1, {} }; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bcaea96512d..83252b6f0a9 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -90,10 +90,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ - M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ - M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ - M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ - M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. 
By default (0) equals to `s3_max_put_rps`", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \ M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 11c9345d4a1..099a7d458d0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -175,7 +175,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT bucket, path, version_id, - settings_ptr->request_settings, + settings_ptr->s3_settings.max_single_read_retries, disk_read_settings, /* use_external_buffer */true, /* offset */0, @@ -212,7 +212,7 @@ std::unique_ptr S3ObjectStorage::readObject( /// NOLINT bucket, object.absolute_path, version_id, - settings_ptr->request_settings, + settings_ptr->s3_settings.max_single_read_retries, patchSettings(read_settings)); } @@ -238,7 +238,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN client.get(), bucket, object.absolute_path, - settings_ptr->request_settings, + settings_ptr->s3_settings, attributes, buf_size, std::move(scheduler), @@ -489,7 +489,7 @@ void S3ObjectStorage::copyObjectImpl( throwIfError(outcome); auto settings_ptr = s3_settings.get(); - if (settings_ptr->request_settings.check_objects_after_upload) + if (settings_ptr->s3_settings.check_objects_after_upload) { auto object_head = requestObjectHeadData(dst_bucket, dst_key); if (!object_head.IsSuccess()) @@ -533,7 +533,7 @@ void S3ObjectStorage::copyObjectMultipartImpl( std::vector part_tags; - size_t upload_part_size = settings_ptr->request_settings.min_upload_part_size; + size_t upload_part_size = settings_ptr->s3_settings.min_upload_part_size; for (size_t position = 0, part_number = 1; position < size; ++part_number, position += upload_part_size) { ProfileEvents::increment(ProfileEvents::S3UploadPartCopy); @@ -586,7 +586,7 @@ void S3ObjectStorage::copyObjectMultipartImpl( throwIfError(outcome); } - if (settings_ptr->request_settings.check_objects_after_upload) + if (settings_ptr->s3_settings.check_objects_after_upload) { auto object_head = requestObjectHeadData(dst_bucket, dst_key); if (!object_head.IsSuccess()) @@ -643,20 +643,17 @@ void S3ObjectStorage::startup() void S3ObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings); - s3_settings.set(std::move(new_s3_settings)); - client.set(std::move(new_client)); + s3_settings.set(getSettings(config, config_prefix, context)); + client.set(getClient(config, config_prefix, context)); applyRemoteThrottlingSettings(context); } std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & new_namespace, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings); return std::make_unique( - std::move(new_client), std::move(new_s3_settings), + getClient(config, config_prefix, context), + 
getSettings(config, config_prefix, context), version_id, s3_capabilities, new_namespace, S3::URI(Poco::URI(config.getString(config_prefix + ".endpoint"))).endpoint); } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 57ceaa679e0..447ca034aac 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -23,17 +23,17 @@ struct S3ObjectStorageSettings S3ObjectStorageSettings() = default; S3ObjectStorageSettings( - const S3Settings::RequestSettings & request_settings_, + const S3Settings::ReadWriteSettings & s3_settings_, uint64_t min_bytes_for_seek_, int32_t list_object_keys_size_, int32_t objects_chunk_size_to_delete_) - : request_settings(request_settings_) + : s3_settings(s3_settings_) , min_bytes_for_seek(min_bytes_for_seek_) , list_object_keys_size(list_object_keys_size_) , objects_chunk_size_to_delete(objects_chunk_size_to_delete_) {} - S3Settings::RequestSettings request_settings; + S3Settings::ReadWriteSettings s3_settings; uint64_t min_bytes_for_seek; int32_t list_object_keys_size; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index ca2e9d04926..e61987163d2 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -33,26 +32,17 @@ namespace ErrorCodes std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { - const Settings & settings = context->getSettingsRef(); - S3Settings::RequestSettings request_settings; - request_settings.max_single_read_retries = config.getUInt64(config_prefix + ".s3_max_single_read_retries", settings.s3_max_single_read_retries); - request_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", settings.s3_min_upload_part_size); - request_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", settings.s3_upload_part_size_multiply_factor); - request_settings.upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", settings.s3_upload_part_size_multiply_parts_count_threshold); - request_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", settings.s3_max_single_part_upload_size); - request_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", settings.s3_check_objects_after_upload); - request_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", settings.s3_max_unexpected_write_error_retries); - - // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. - if (UInt64 max_get_rps = config.getUInt64(config_prefix + ".s3_max_get_rps", settings.s3_max_get_rps)) - request_settings.get_request_throttler = std::make_shared( - max_get_rps, config.getUInt64(config_prefix + ".s3_max_get_burst", settings.s3_max_get_burst ? 
settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); - if (UInt64 max_put_rps = config.getUInt64(config_prefix + ".s3_max_put_rps", settings.s3_max_put_rps)) - request_settings.put_request_throttler = std::make_shared( - max_put_rps, config.getUInt64(config_prefix + ".s3_max_put_burst", settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); + S3Settings::ReadWriteSettings rw_settings; + rw_settings.max_single_read_retries = config.getUInt64(config_prefix + ".s3_max_single_read_retries", context->getSettingsRef().s3_max_single_read_retries); + rw_settings.min_upload_part_size = config.getUInt64(config_prefix + ".s3_min_upload_part_size", context->getSettingsRef().s3_min_upload_part_size); + rw_settings.upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_factor", context->getSettingsRef().s3_upload_part_size_multiply_factor); + rw_settings.upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".s3_upload_part_size_multiply_parts_count_threshold", context->getSettingsRef().s3_upload_part_size_multiply_parts_count_threshold); + rw_settings.max_single_part_upload_size = config.getUInt64(config_prefix + ".s3_max_single_part_upload_size", context->getSettingsRef().s3_max_single_part_upload_size); + rw_settings.check_objects_after_upload = config.getUInt64(config_prefix + ".s3_check_objects_after_upload", context->getSettingsRef().s3_check_objects_after_upload); + rw_settings.max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".s3_max_unexpected_write_error_retries", context->getSettingsRef().s3_max_unexpected_write_error_retries); return std::make_unique( - request_settings, + rw_settings, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getInt(config_prefix + ".objects_chunk_size_to_delete", 1000)); @@ -122,20 +112,14 @@ std::shared_ptr getProxyConfiguration(const String & pre } -std::unique_ptr getClient( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - ContextPtr context, - const S3ObjectStorageSettings & settings) +std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( config.getString(config_prefix + ".region", ""), context->getRemoteHostFilter(), static_cast(context->getGlobalContext()->getSettingsRef().s3_max_redirects), context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ true, - settings.request_settings.get_request_throttler, - settings.request_settings.put_request_throttler); + /* for_disk_s3 = */ true); S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint"))); if (uri.key.back() != '/') diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 04eb7aced8e..05ba8819f83 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -22,7 +22,7 @@ struct S3ObjectStorageSettings; std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); -std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, const S3ObjectStorageSettings & 
settings); +std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); } diff --git a/src/Disks/ObjectStorages/S3/parseConfig.h b/src/Disks/ObjectStorages/S3/parseConfig.h new file mode 100644 index 00000000000..1defc673c2e --- /dev/null +++ b/src/Disks/ObjectStorages/S3/parseConfig.h @@ -0,0 +1,27 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + + +std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); + +std::shared_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); + + +} diff --git a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp index e73accbb956..e09aef22122 100644 --- a/src/Disks/ObjectStorages/S3/registerDiskS3.cpp +++ b/src/Disks/ObjectStorages/S3/registerDiskS3.cpp @@ -130,16 +130,21 @@ void registerDiskS3(DiskFactory & factory) chassert(type == "s3" || type == "s3_plain"); MetadataStoragePtr metadata_storage; - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings); if (type == "s3_plain") { - s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + s3_storage = std::make_shared( + getClient(config, config_prefix, context), + getSettings(config, config_prefix, context), + uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); metadata_storage = std::make_shared(s3_storage, uri.key); } else { - s3_storage = std::make_shared(std::move(client), std::move(settings), uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + s3_storage = std::make_shared( + getClient(config, config_prefix, context), + getSettings(config, config_prefix, context), + uri.version_id, s3_capabilities, uri.bucket, uri.endpoint); + auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); metadata_storage = std::make_shared(metadata_disk, uri.key); } diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index c14fbecf223..c49941b025d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -45,7 +45,7 @@ ReadBufferFromS3::ReadBufferFromS3( const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, size_t offset_, @@ -56,7 +56,7 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , version_id(version_id_) - , request_settings(request_settings_) + , max_single_read_retries(max_single_read_retries_) , offset(offset_) , read_until_position(read_until_position_) , read_settings(settings_) @@ -105,7 +105,7 @@ bool ReadBufferFromS3::nextImpl() } size_t sleep_time_with_backoff_milliseconds = 100; - for (size_t attempt = 0; attempt < request_settings.max_single_read_retries && !next_result; ++attempt) + for (size_t attempt = 0; (attempt < max_single_read_retries) && !next_result; ++attempt) { Stopwatch watch; try @@ -166,7 +166,7 @@ bool ReadBufferFromS3::nextImpl() attempt, e.message()); - if (attempt + 1 == request_settings.max_single_read_retries) + if (attempt + 1 == max_single_read_retries) throw; /// Pause before next 
attempt. @@ -349,7 +349,7 @@ SeekableReadBufferPtr ReadBufferS3Factory::getReader() bucket, key, version_id, - request_settings, + s3_max_single_read_retries, read_settings, false /*use_external_buffer*/, next_range->first, diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 17b13bf7d62..cc836bba495 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include "config.h" #if USE_AWS_S3 @@ -34,7 +33,7 @@ private: String bucket; String key; String version_id; - const S3Settings::RequestSettings request_settings; + UInt64 max_single_read_retries; /// These variables are atomic because they can be used for `logging only` /// (where it is not important to get consistent result) @@ -53,7 +52,7 @@ public: const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer = false, size_t offset_ = 0, @@ -101,7 +100,7 @@ public: const String & version_id_, size_t range_step_, size_t object_size_, - const S3Settings::RequestSettings & request_settings_, + UInt64 s3_max_single_read_retries_, const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) @@ -111,7 +110,7 @@ public: , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) - , request_settings(request_settings_) + , s3_max_single_read_retries(s3_max_single_read_retries_) { assert(range_step > 0); assert(range_step < object_size); @@ -136,7 +135,7 @@ private: size_t range_step; size_t object_size; - const S3Settings::RequestSettings request_settings; + UInt64 s3_max_single_read_retries; }; } diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 33dd3250c9f..7d053bebe4a 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -77,16 +76,12 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration( const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, bool enable_s3_requests_logging_, - bool for_disk_s3_, - const ThrottlerPtr & get_request_throttler_, - const ThrottlerPtr & put_request_throttler_) + bool for_disk_s3_) : force_region(force_region_) , remote_host_filter(remote_host_filter_) , s3_max_redirects(s3_max_redirects_) , enable_s3_requests_logging(enable_s3_requests_logging_) , for_disk_s3(for_disk_s3_) - , get_request_throttler(get_request_throttler_) - , put_request_throttler(put_request_throttler_) { } @@ -133,8 +128,6 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_config , s3_max_redirects(client_configuration.s3_max_redirects) , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging) , for_disk_s3(client_configuration.for_disk_s3) - , get_request_throttler(client_configuration.get_request_throttler) - , put_request_throttler(client_configuration.put_request_throttler) , extra_headers(client_configuration.extra_headers) { } @@ -252,23 +245,6 @@ void PocoHTTPClient::makeRequestInternal( if (enable_s3_requests_logging) LOG_TEST(log, "Make request to: {}", uri); - switch (request.GetMethod()) - { - case Aws::Http::HttpMethod::HTTP_GET: - case Aws::Http::HttpMethod::HTTP_HEAD: - if (get_request_throttler) - get_request_throttler->add(1); - break; - case Aws::Http::HttpMethod::HTTP_PUT: - case Aws::Http::HttpMethod::HTTP_POST: - case 
Aws::Http::HttpMethod::HTTP_PATCH: - if (put_request_throttler) - put_request_throttler->add(1); - break; - case Aws::Http::HttpMethod::HTTP_DELETE: - break; // Not throttled - } - addMetric(request, S3MetricType::Count); CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests}; diff --git a/src/IO/S3/PocoHTTPClient.h b/src/IO/S3/PocoHTTPClient.h index ed6e1793c2c..5649638285d 100644 --- a/src/IO/S3/PocoHTTPClient.h +++ b/src/IO/S3/PocoHTTPClient.h @@ -8,7 +8,6 @@ #if USE_AWS_S3 #include -#include #include #include #include @@ -49,8 +48,6 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration unsigned int s3_max_redirects; bool enable_s3_requests_logging; bool for_disk_s3; - ThrottlerPtr get_request_throttler; - ThrottlerPtr put_request_throttler; HeaderCollection extra_headers; void updateSchemeAndRegion(); @@ -63,9 +60,7 @@ private: const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, bool enable_s3_requests_logging_, - bool for_disk_s3_, - const ThrottlerPtr & get_request_throttler_, - const ThrottlerPtr & put_request_throttler_ + bool for_disk_s3_ ); /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization. @@ -159,16 +154,6 @@ private: unsigned int s3_max_redirects; bool enable_s3_requests_logging; bool for_disk_s3; - - /// Limits get request per second rate for GET, SELECT and all other requests, excluding throttled by put throttler - /// (i.e. throttles GetObject, HeadObject) - ThrottlerPtr get_request_throttler; - - /// Limits put request per second rate for PUT, COPY, POST, LIST requests - /// (i.e. throttles PutObject, CopyObject, ListObjects, CreateMultipartUpload, UploadPartCopy, UploadPart, CompleteMultipartUpload) - /// NOTE: DELETE and CANCEL requests are not throttled by either put or get throttler - ThrottlerPtr put_request_throttler; - const HeaderCollection extra_headers; }; diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 2d298ca5df2..9b2a65d84fc 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -88,9 +88,7 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) remote_host_filter, s3_max_redirects, enable_s3_requests_logging, - /* for_disk_s3 = */ false, - /* get_request_throttler = */ {}, - /* put_request_throttler = */ {} + /* for_disk_s3 = */ false ); client_configuration.endpointOverride = uri.endpoint; @@ -115,14 +113,12 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) ASSERT_TRUE(client); DB::ReadSettings read_settings; - DB::S3Settings::RequestSettings request_settings; - request_settings.max_single_read_retries = max_single_read_retries; DB::ReadBufferFromS3 read_buffer( client, uri.bucket, uri.key, version_id, - request_settings, + max_single_read_retries, read_settings ); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 5af09275df4..859f5ce796b 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -573,14 +573,7 @@ public: /// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be /// quite verbose even if nobody configured them. So we use our provider first and only after it use default providers. 
{ - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration( - configuration.region, - configuration.remote_host_filter, - configuration.s3_max_redirects, - configuration.enable_s3_requests_logging, - configuration.for_disk_s3, - configuration.get_request_throttler, - configuration.put_request_throttler); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); AddProvider(std::make_shared(aws_client_configuration)); } @@ -617,14 +610,7 @@ public: } else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true") { - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration( - configuration.region, - configuration.remote_host_filter, - configuration.s3_max_redirects, - configuration.enable_s3_requests_logging, - configuration.for_disk_s3, - configuration.get_request_throttler, - configuration.put_request_throttler); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); /// See MakeDefaultHttpResourceClientConfiguration(). /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside @@ -745,18 +731,9 @@ namespace S3 const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, bool enable_s3_requests_logging, - bool for_disk_s3, - const ThrottlerPtr & get_request_throttler, - const ThrottlerPtr & put_request_throttler) + bool for_disk_s3) { - return PocoHTTPClientConfiguration( - force_region, - remote_host_filter, - s3_max_redirects, - enable_s3_requests_logging, - for_disk_s3, - get_request_throttler, - put_request_throttler); + return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging, for_disk_s3); } URI::URI(const Poco::URI & uri_) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 01a6b8d5d82..93e5eb78c7f 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -17,7 +17,6 @@ #include #include -#include namespace Aws::S3 { @@ -89,9 +88,7 @@ public: const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, bool enable_s3_requests_logging, - bool for_disk_s3, - const ThrottlerPtr & get_request_throttler, - const ThrottlerPtr & put_request_throttler); + bool for_disk_s3); private: ClientFactory(); diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 56d487f165b..9ed2c41fd01 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -71,7 +71,7 @@ WriteBufferFromS3::WriteBufferFromS3( std::shared_ptr client_ptr_, const String & bucket_, const String & key_, - const S3Settings::RequestSettings & request_settings_, + const S3Settings::ReadWriteSettings & s3_settings_, std::optional> object_metadata_, size_t buffer_size_, ThreadPoolCallbackRunner schedule_, @@ -79,10 +79,10 @@ WriteBufferFromS3::WriteBufferFromS3( : BufferWithOwnMemory(buffer_size_, nullptr, 0) , bucket(bucket_) , key(key_) - , request_settings(request_settings_) + , s3_settings(s3_settings_) , client_ptr(std::move(client_ptr_)) , 
object_metadata(std::move(object_metadata_)) - , upload_part_size(request_settings_.min_upload_part_size) + , upload_part_size(s3_settings_.min_upload_part_size) , schedule(std::move(schedule_)) , write_settings(write_settings_) { @@ -107,7 +107,7 @@ void WriteBufferFromS3::nextImpl() write_settings.remote_throttler->add(offset()); /// Data size exceeds singlepart upload threshold, need to use multipart upload. - if (multipart_upload_id.empty() && last_part_size > request_settings.max_single_part_upload_size) + if (multipart_upload_id.empty() && last_part_size > s3_settings.max_single_part_upload_size) createMultipartUpload(); if (!multipart_upload_id.empty() && last_part_size > upload_part_size) @@ -122,10 +122,10 @@ void WriteBufferFromS3::nextImpl() void WriteBufferFromS3::allocateBuffer() { - if (total_parts_uploaded != 0 && total_parts_uploaded % request_settings.upload_part_size_multiply_parts_count_threshold == 0) + if (total_parts_uploaded != 0 && total_parts_uploaded % s3_settings.upload_part_size_multiply_parts_count_threshold == 0) { - upload_part_size *= request_settings.upload_part_size_multiply_factor; - upload_part_size = std::min(upload_part_size, request_settings.max_upload_part_size); + upload_part_size *= s3_settings.upload_part_size_multiply_factor; + upload_part_size = std::min(upload_part_size, s3_settings.max_upload_part_size); } temporary_buffer = Aws::MakeShared("temporary buffer"); @@ -180,7 +180,7 @@ void WriteBufferFromS3::finalizeImpl() if (!multipart_upload_id.empty()) completeMultipartUpload(); - if (request_settings.check_objects_after_upload) + if (s3_settings.check_objects_after_upload) { LOG_TRACE(log, "Checking object {} exists after upload", key); @@ -370,7 +370,7 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetMultipartUpload(multipart_upload); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(s3_settings.max_unexpected_write_error_retries, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -476,7 +476,7 @@ void WriteBufferFromS3::fillPutRequest(Aws::S3::Model::PutObjectRequest & req) void WriteBufferFromS3::processPutRequest(const PutObjectTask & task) { - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(s3_settings.max_unexpected_write_error_retries, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3PutObject); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index b4b5a6d37a3..28f831856d7 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -50,7 +50,7 @@ public: std::shared_ptr client_ptr_, const String & bucket_, const String & key_, - const S3Settings::RequestSettings & request_settings_, + const S3Settings::ReadWriteSettings & s3_settings_, std::optional> object_metadata_ = std::nullopt, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, ThreadPoolCallbackRunner schedule_ = {}, @@ -88,7 +88,7 @@ private: const String bucket; const String key; - const S3Settings::RequestSettings request_settings; + const S3Settings::ReadWriteSettings s3_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index a8c27e3d1d4..5736336983a 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ 
b/src/Storages/ExternalDataSourceConfiguration.h @@ -118,7 +118,7 @@ struct URLBasedDataSourceConfiguration struct StorageS3Configuration : URLBasedDataSourceConfiguration { S3::AuthSettings auth_settings; - S3Settings::RequestSettings request_settings; + S3Settings::ReadWriteSettings rw_settings; }; diff --git a/src/Storages/StorageDelta.cpp b/src/Storages/StorageDelta.cpp index c077b24c610..e8287a2fd61 100644 --- a/src/Storages/StorageDelta.cpp +++ b/src/Storages/StorageDelta.cpp @@ -151,14 +151,12 @@ std::vector JsonMetadataGetter::getJsonLogFiles() std::shared_ptr JsonMetadataGetter::createS3ReadBuffer(const String & key, ContextPtr context) { /// TODO: add parallel downloads - S3Settings::RequestSettings request_settings; - request_settings.max_single_read_retries = 10; return std::make_shared( base_configuration.client, base_configuration.uri.bucket, key, base_configuration.uri.version_id, - request_settings, + /* max single read retries */10, context->getReadSettings()); } @@ -189,7 +187,7 @@ StorageDelta::StorageDelta( ContextPtr context_, std::optional format_settings_) : IStorage(table_id_) - , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} + , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} , log(&Poco::Logger::get("StorageDeltaLake (" + table_id_.table_name + ")")) , table_path(base_configuration.uri.key) { diff --git a/src/Storages/StorageHudi.cpp b/src/Storages/StorageHudi.cpp index 4b20e4cbd22..121856c4a57 100644 --- a/src/Storages/StorageHudi.cpp +++ b/src/Storages/StorageHudi.cpp @@ -37,7 +37,7 @@ StorageHudi::StorageHudi( ContextPtr context_, std::optional format_settings_) : IStorage(table_id_) - , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} + , base_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} , log(&Poco::Logger::get("StorageHudi (" + table_id_.table_name + ")")) , table_path(base_configuration.uri.key) { diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 7b575d0d12f..ab9b71f5ff3 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -100,8 +100,7 @@ public: const Block & virtual_header_, ContextPtr context_, std::unordered_map * object_infos_, - Strings * read_keys_, - const S3Settings::RequestSettings & request_settings_) + Strings * read_keys_) : WithContext(context_) , client(client_) , globbed_uri(globbed_uri_) @@ -109,7 +108,6 @@ public: , virtual_header(virtual_header_) , object_infos(object_infos_) , read_keys(read_keys_) - , request_settings(request_settings_) { if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) throw Exception("Expression can not have wildcards inside bucket name", ErrorCodes::UNEXPECTED_EXPRESSION); @@ -260,7 +258,6 @@ private: bool is_finished{false}; std::unordered_map * object_infos; Strings * read_keys; - S3Settings::RequestSettings request_settings; }; StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( @@ -270,9 +267,8 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( const Block & virtual_header, ContextPtr context, std::unordered_map * object_infos_, - Strings * read_keys_, - const S3Settings::RequestSettings & request_settings_) - : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_, 
request_settings_)) + Strings * read_keys_) + : pimpl(std::make_shared(client_, globbed_uri_, query, virtual_header, context, object_infos_, read_keys_)) { } @@ -385,7 +381,7 @@ StorageS3Source::StorageS3Source( std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - const S3Settings::RequestSettings & request_settings_, + UInt64 max_single_read_retries_, String compression_hint_, const std::shared_ptr & client_, const String & bucket_, @@ -401,7 +397,7 @@ StorageS3Source::StorageS3Source( , format(format_) , columns_desc(columns_) , max_block_size(max_block_size_) - , request_settings(request_settings_) + , max_single_read_retries(max_single_read_retries_) , compression_hint(std::move(compression_hint_)) , client(client_) , sample_block(sample_block_) @@ -467,7 +463,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (!use_parallel_download || object_too_small) { LOG_TRACE(log, "Downloading object of size {} from S3 in single thread", object_size); - return std::make_unique(client, bucket, key, version_id, request_settings, getContext()->getReadSettings()); + return std::make_unique(client, bucket, key, version_id, max_single_read_retries, getContext()->getReadSettings()); } assert(object_size > 0); @@ -479,7 +475,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k } auto factory = std::make_unique( - client, bucket, key, version_id, download_buffer_size, object_size, request_settings, getContext()->getReadSettings()); + client, bucket, key, version_id, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); LOG_TRACE( log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); @@ -589,7 +585,7 @@ public: s3_configuration_.client, bucket, key, - s3_configuration_.request_settings, + s3_configuration_.rw_settings, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "S3ParallelRead"), @@ -753,7 +749,7 @@ StorageS3::StorageS3( bool distributed_processing_, ASTPtr partition_by_) : IStorage(table_id_) - , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} + , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} , keys({s3_configuration.uri.key}) , format_name(configuration_.format) , compression_method(configuration_.compression_method) @@ -819,7 +815,7 @@ std::shared_ptr StorageS3::createFileIterator( { /// Iterate through disclosed globs and make a source for each file auto glob_iterator = std::make_shared( - *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys, s3_configuration.request_settings); + *s3_configuration.client, s3_configuration.uri, query, virtual_block, local_context, object_infos, read_keys); return std::make_shared([glob_iterator]() { return glob_iterator->next(); }); } else @@ -909,7 +905,7 @@ Pipe StorageS3::read( format_settings, columns_description, max_block_size, - s3_configuration.request_settings, + s3_configuration.rw_settings.max_single_read_retries, compression_method, s3_configuration.client, s3_configuration.uri.bucket, @@ -1026,10 +1022,12 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration & upd) { auto settings = 
ctx->getStorageS3Settings().getSettings(upd.uri.uri.toString()); - if (upd.request_settings != settings.request_settings) - upd.request_settings = settings.request_settings; + const auto & config_rw_settings = settings.rw_settings; - upd.request_settings.updateFromSettingsIfEmpty(ctx->getSettings()); + if (upd.rw_settings != config_rw_settings) + upd.rw_settings = settings.rw_settings; + + upd.rw_settings.updateFromSettingsIfEmpty(ctx->getSettings()); if (upd.client) { @@ -1047,12 +1045,10 @@ void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration ctx->getRemoteHostFilter(), static_cast(ctx->getGlobalContext()->getSettingsRef().s3_max_redirects), ctx->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, - /* for_disk_s3 = */ false, - upd.request_settings.get_request_throttler, - upd.request_settings.put_request_throttler); + /* for_disk_s3 = */ false); client_configuration.endpointOverride = upd.uri.endpoint; - client_configuration.maxConnections = static_cast(upd.request_settings.max_connections); + client_configuration.maxConnections = static_cast(upd.rw_settings.max_connections); auto credentials = Aws::Auth::AWSCredentials(upd.auth_settings.access_key_id, upd.auth_settings.secret_access_key); auto headers = upd.auth_settings.headers; @@ -1084,17 +1080,17 @@ void StorageS3::processNamedCollectionResult(StorageS3Configuration & configurat else if (arg_name == "use_environment_credentials") configuration.auth_settings.use_environment_credentials = checkAndGetLiteralArgument(arg_value, "use_environment_credentials"); else if (arg_name == "max_single_read_retries") - configuration.request_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_read_retries"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_read_retries"); else if (arg_name == "min_upload_part_size") - configuration.request_settings.min_upload_part_size = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "min_upload_part_size"); else if (arg_name == "upload_part_size_multiply_factor") - configuration.request_settings.upload_part_size_multiply_factor = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_factor"); else if (arg_name == "upload_part_size_multiply_parts_count_threshold") - configuration.request_settings.upload_part_size_multiply_parts_count_threshold = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "upload_part_size_multiply_parts_count_threshold"); else if (arg_name == "max_single_part_upload_size") - configuration.request_settings.max_single_part_upload_size = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_single_part_upload_size"); else if (arg_name == "max_connections") - configuration.request_settings.max_connections = checkAndGetLiteralArgument(arg_value, "max_connections"); + configuration.rw_settings.max_single_read_retries = checkAndGetLiteralArgument(arg_value, "max_connections"); else throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Unknown key-value argument `{}` for 
StorageS3, expected: url, [access_key_id, secret_access_key], name of used format and [compression_method].", @@ -1170,7 +1166,7 @@ ColumnsDescription StorageS3::getTableStructureFromData( S3Configuration s3_configuration{ configuration.url, configuration.auth_settings, - S3Settings::RequestSettings(ctx->getSettingsRef()), + S3Settings::ReadWriteSettings(ctx->getSettingsRef()), configuration.headers}; updateS3Configuration(ctx, s3_configuration); @@ -1232,7 +1228,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( int zstd_window_log_max = static_cast(ctx->getSettingsRef().zstd_window_log_max); return wrapReadBufferWithCompressionMethod( std::make_unique( - s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.request_settings, ctx->getReadSettings()), + s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), chooseCompressionMethod(key, compression_method), zstd_window_log_max); }; diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 47ac26abccb..81bbe2c86ae 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -43,8 +43,7 @@ public: const Block & virtual_header, ContextPtr context, std::unordered_map * object_infos = nullptr, - Strings * read_keys_ = nullptr, - const S3Settings::RequestSettings & request_settings_ = {}); + Strings * read_keys_ = nullptr); String next(); @@ -80,7 +79,7 @@ public: std::optional format_settings_, const ColumnsDescription & columns_, UInt64 max_block_size_, - const S3Settings::RequestSettings & request_settings_, + UInt64 max_single_read_retries_, String compression_hint_, const std::shared_ptr & client_, const String & bucket, @@ -103,7 +102,7 @@ private: String format; ColumnsDescription columns_desc; UInt64 max_block_size; - S3Settings::RequestSettings request_settings; + UInt64 max_single_read_retries; String compression_hint; std::shared_ptr client; Block sample_block; @@ -187,7 +186,7 @@ public: std::shared_ptr client; S3::AuthSettings auth_settings; - S3Settings::RequestSettings request_settings; + S3Settings::ReadWriteSettings rw_settings; /// If s3 configuration was passed from ast, then it is static. /// If from config - it can be changed with config reload. 
@@ -199,11 +198,11 @@ public: S3Configuration( const String & url_, const S3::AuthSettings & auth_settings_, - const S3Settings::RequestSettings & request_settings_, + const S3Settings::ReadWriteSettings & rw_settings_, const HeaderCollection & headers_from_ast_) : uri(S3::URI(url_)) , auth_settings(auth_settings_) - , request_settings(request_settings_) + , rw_settings(rw_settings_) , static_configuration(!auth_settings_.access_key_id.empty()) , headers_from_ast(headers_from_ast_) {} }; diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index ec970654b6e..350e942f972 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -46,7 +46,7 @@ StorageS3Cluster::StorageS3Cluster( const ConstraintsDescription & constraints_, ContextPtr context_) : IStorage(table_id_) - , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.request_settings, configuration_.headers} + , s3_configuration{configuration_.url, configuration_.auth_settings, configuration_.rw_settings, configuration_.headers} , filename(configuration_.url) , cluster_name(configuration_.cluster_name) , format_name(configuration_.format) diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 962f7c113bd..68e15d10f52 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -58,26 +57,18 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U auto auth_settings = S3::AuthSettings::loadFromConfig(config_elem + "." + key, config); - S3Settings::RequestSettings request_settings; - request_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries); - request_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size); - request_settings.max_upload_part_size = get_uint_for_key(key, "max_upload_part_size", true, DEFAULT_MAX_UPLOAD_PART_SIZE); - request_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, settings.s3_upload_part_size_multiply_factor); - request_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold); - request_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size); - request_settings.max_single_operation_copy_size = get_uint_for_key(key, "max_single_operation_copy_size", true, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE); - request_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); - request_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); + S3Settings::ReadWriteSettings rw_settings; + rw_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries); + rw_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size); + rw_settings.max_upload_part_size = get_uint_for_key(key, "max_upload_part_size", true, DEFAULT_MAX_UPLOAD_PART_SIZE); + rw_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, 
settings.s3_upload_part_size_multiply_factor); + rw_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold); + rw_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size); + rw_settings.max_single_operation_copy_size = get_uint_for_key(key, "max_single_operation_copy_size", true, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE); + rw_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); + rw_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); - // NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. - if (UInt64 max_get_rps = get_uint_for_key(key, "max_get_rps", true, settings.s3_max_get_rps)) - request_settings.get_request_throttler = std::make_shared( - max_get_rps, get_uint_for_key(key, "max_get_burst", true, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * max_get_rps)); - if (UInt64 max_put_rps = get_uint_for_key(key, "max_put_rps", true, settings.s3_max_put_rps)) - request_settings.put_request_throttler = std::make_shared( - max_put_rps, get_uint_for_key(key, "max_put_burst", true, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * max_put_rps)); - - s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); + s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(rw_settings)}); } } } @@ -98,7 +89,7 @@ S3Settings StorageS3Settings::getSettings(const String & endpoint) const return {}; } -S3Settings::RequestSettings::RequestSettings(const Settings & settings) +S3Settings::ReadWriteSettings::ReadWriteSettings(const Settings & settings) { max_single_read_retries = settings.s3_max_single_read_retries; min_upload_part_size = settings.s3_min_upload_part_size; @@ -108,15 +99,9 @@ S3Settings::RequestSettings::RequestSettings(const Settings & settings) max_connections = settings.s3_max_connections; check_objects_after_upload = settings.s3_check_objects_after_upload; max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; - if (settings.s3_max_get_rps) - get_request_throttler = std::make_shared( - settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); - if (settings.s3_max_put_rps) - put_request_throttler = std::make_shared( - settings.s3_max_put_rps, settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); } -void S3Settings::RequestSettings::updateFromSettingsIfEmpty(const Settings & settings) +void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & settings) { if (!max_single_read_retries) max_single_read_retries = settings.s3_max_single_read_retries; @@ -137,12 +122,6 @@ void S3Settings::RequestSettings::updateFromSettingsIfEmpty(const Settings & set if (!max_unexpected_write_error_retries) max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; check_objects_after_upload = settings.s3_check_objects_after_upload; - if (!get_request_throttler && settings.s3_max_get_rps) - get_request_throttler = std::make_shared( - settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); - if (!put_request_throttler && settings.s3_max_put_rps) - put_request_throttler = std::make_shared( - settings.s3_max_put_rps, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); } } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 955cd2d025b..bd90ba569d8 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -24,7 +23,7 @@ struct Settings; struct S3Settings { - struct RequestSettings + struct ReadWriteSettings { size_t max_single_read_retries = 0; size_t min_upload_part_size = 0; @@ -36,13 +35,11 @@ struct S3Settings size_t max_connections = 0; bool check_objects_after_upload = false; size_t max_unexpected_write_error_retries = 0; - ThrottlerPtr get_request_throttler; - ThrottlerPtr put_request_throttler; - RequestSettings() = default; - explicit RequestSettings(const Settings & settings); + ReadWriteSettings() = default; + explicit ReadWriteSettings(const Settings & settings); - inline bool operator==(const RequestSettings & other) const + inline bool operator==(const ReadWriteSettings & other) const { return max_single_read_retries == other.max_single_read_retries && min_upload_part_size == other.min_upload_part_size @@ -53,20 +50,18 @@ struct S3Settings && max_single_operation_copy_size == other.max_single_operation_copy_size && max_connections == other.max_connections && check_objects_after_upload == other.check_objects_after_upload - && max_unexpected_write_error_retries == other.max_unexpected_write_error_retries - && get_request_throttler == other.get_request_throttler - && put_request_throttler == other.put_request_throttler; + && max_unexpected_write_error_retries == other.max_unexpected_write_error_retries; } void updateFromSettingsIfEmpty(const Settings & settings); }; S3::AuthSettings auth_settings; - RequestSettings request_settings; + ReadWriteSettings rw_settings; inline bool operator==(const S3Settings & other) const { - return auth_settings == other.auth_settings && request_settings == other.request_settings; + return auth_settings == other.auth_settings && rw_settings == other.rw_settings; } }; diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.reference b/tests/queries/0_stateless/02477_s3_request_throttler.reference deleted file mode 100644 index 9315e86b328..00000000000 --- a/tests/queries/0_stateless/02477_s3_request_throttler.reference +++ /dev/null @@ -1,2 +0,0 @@ -1 -1 1 1 diff --git a/tests/queries/0_stateless/02477_s3_request_throttler.sql 
b/tests/queries/0_stateless/02477_s3_request_throttler.sql deleted file mode 100644 index 25653b1bab0..00000000000 --- a/tests/queries/0_stateless/02477_s3_request_throttler.sql +++ /dev/null @@ -1,26 +0,0 @@ --- Tags: no-fasttest --- Tag no-fasttest: needs s3 - --- Limit S3 PUT request per second rate -SET s3_max_put_rps = 2; -SET s3_max_put_burst = 1; - -CREATE TEMPORARY TABLE times (t DateTime); - --- INSERT query requires 3 PUT requests and 1/rps = 0.5 second in between, the first query is not throttled due to burst -INSERT INTO times SELECT now(); -INSERT INTO TABLE FUNCTION s3('http://localhost:11111/test/request-throttler.csv', 'test', 'testtest', 'CSV', 'number UInt64') SELECT number FROM numbers(1000000) SETTINGS s3_max_single_part_upload_size = 10000, s3_truncate_on_insert = 1; -INSERT INTO times SELECT now(); - -SELECT max(t) - min(t) >= 1 FROM times; - -SYSTEM FLUSH LOGS; -SELECT ProfileEvents['S3CreateMultipartUpload'] == 1, - ProfileEvents['S3UploadPart'] == 1, - ProfileEvents['S3CompleteMultipartUpload'] == 1 -FROM system.query_log -WHERE query LIKE '%request-throttler.csv%' -AND type = 'QueryFinish' -AND current_database = currentDatabase() -ORDER BY query_start_time DESC -LIMIT 1; From 93dde0b3f922efdb9882eb7c9f2b50eda693a1a3 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 16 Nov 2022 16:18:20 -0500 Subject: [PATCH 62/78] Update docs for Date32 --- docs/en/sql-reference/data-types/date32.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/date32.md b/docs/en/sql-reference/data-types/date32.md index c8c7470d2cb..db41fdf2fc3 100644 --- a/docs/en/sql-reference/data-types/date32.md +++ b/docs/en/sql-reference/data-types/date32.md @@ -6,7 +6,7 @@ sidebar_label: Date32 # Date32 -A date. Supports the date range same with [DateTime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1900-01-01. Allows storing values till 2299-12-31. +A date. Supports the date range same with [DateTime64](../../sql-reference/data-types/datetime64.md). Stored as a signed 32-bit integer in native byte order with the value representing the days since 1970-01-01 (0 represents 1970-01-01 and negative values represent the days before 1970). **Examples** From f4eb430caad8d5e5410b29ab0d5267e0048e8961 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 17 Nov 2022 03:14:22 +0100 Subject: [PATCH 63/78] Revert "remove changes" This reverts commit 4016fd2efcb196a97bfc157f661acf6c165f1fa8. --- CHANGELOG.md | 109 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68767612892..051bde44dd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v22.11, 2022-11-17](#2211)**
**[ClickHouse release v22.10, 2022-10-25](#2210)**
**[ClickHouse release v22.9, 2022-09-22](#229)**
**[ClickHouse release v22.8-lts, 2022-08-18](#228)**
@@ -11,6 +12,108 @@ **[ClickHouse release v22.1, 2022-01-18](#221)**
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**
+### ClickHouse release 22.11, 2022-11-17 + +#### Backward Incompatible Change +* `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). + +#### New Feature +* Adds support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). +* Add table function `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). +* Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In the case where the types of Intervals are different, they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `**` glob support for recursive directory traversal of the filesystem and S3. Resolves [#36316](https://github.com/ClickHouse/ClickHouse/issues/36316). [#42376](https://github.com/ClickHouse/ClickHouse/pull/42376) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Introduce `s3_plain` disk type for write-once-read-many operations. Implement `ATTACH` of `MergeTree` table for `s3_plain` disk. [#42628](https://github.com/ClickHouse/ClickHouse/pull/42628) ([Azat Khuzhin](https://github.com/azat)). +* Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). +* Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). +* Add function `ascii` like in Apache Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). +* Add function `pmod` which returns non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Add function `formatReadableDecimalSize`. [#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). +* Add function `randCanonical`, which is similar to the `rand` function in Apache Spark or Impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). 
+* Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). +* Add `min_age_to_force_merge_on_partition_only` setting to optimize old parts for the entire partition only. [#42659](https://github.com/ClickHouse/ClickHouse/pull/42659) ([Antonio Andelic](https://github.com/antonio2368)). +* Add generic implementation for arbitrary structured named collections, access type and `system.named_collections`. [#43147](https://github.com/ClickHouse/ClickHouse/pull/43147) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Performance Improvement +* `match` function can use the index if it's a condition on string prefix. This closes [#37333](https://github.com/ClickHouse/ClickHouse/issues/37333). [#42458](https://github.com/ClickHouse/ClickHouse/pull/42458) ([clarkcaoliu](https://github.com/Clark0)). +* Speed up AND and OR operators when they are sequenced. [#42214](https://github.com/ClickHouse/ClickHouse/pull/42214) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Support parallel parsing for `LineAsString` input format. This improves performance just slightly. This closes [#42502](https://github.com/ClickHouse/ClickHouse/issues/42502). [#42780](https://github.com/ClickHouse/ClickHouse/pull/42780) ([Kruglov Pavel](https://github.com/Avogar)). +* ClickHouse Keeper performance improvement: improve commit performance for cases when many different nodes have uncommitted states. This should help with cases when a follower node can't sync fast enough. [#42926](https://github.com/ClickHouse/ClickHouse/pull/42926) ([Antonio Andelic](https://github.com/antonio2368)). +* A condition like `NOT LIKE 'prefix%'` can use the primary index. [#42209](https://github.com/ClickHouse/ClickHouse/pull/42209) ([Duc Canh Le](https://github.com/canhld94)). + +#### Experimental Feature +* Support type `Object` inside other types, e.g. `Array(JSON)`. [#36969](https://github.com/ClickHouse/ClickHouse/pull/36969) ([Anton Popov](https://github.com/CurtizJ)). +* Ignore MySQL binlog SAVEPOINT event for MaterializedMySQL. [#42931](https://github.com/ClickHouse/ClickHouse/pull/42931) ([zzsmdfj](https://github.com/zzsmdfj)). Handle (ignore) SAVEPOINT queries in MaterializedMySQL. [#43086](https://github.com/ClickHouse/ClickHouse/pull/43086) ([Stig Bakken](https://github.com/stigsb)). + +#### Improvement +* Trivial queries with small LIMIT will properly determine the number of estimated rows to read, so that the threshold will be checked properly. Closes [#7071](https://github.com/ClickHouse/ClickHouse/issues/7071). [#42580](https://github.com/ClickHouse/ClickHouse/pull/42580) ([Han Fei](https://github.com/hanfei1991)). +* Add support for interactive parameters in INSERT VALUES queries. [#43077](https://github.com/ClickHouse/ClickHouse/pull/43077) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added new field `allow_readonly` in `system.table_functions` to allow using table functions in readonly mode. Resolves [#42414](https://github.com/ClickHouse/ClickHouse/issues/42414) Implementation: * Added a new field allow_readonly to table system.table_functions. * Updated to use new field allow_readonly to allow using table functions in readonly mode. Testing: * Added a test for filesystem tests/queries/0_stateless/02473_functions_in_readonly_mode.sh Documentation: * Updated the english documentation for Table Functions. 
[#42708](https://github.com/ClickHouse/ClickHouse/pull/42708) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* The `system.asynchronous_metrics` table gets embedded documentation. This documentation is also exported to Prometheus. Fixed an error with the metrics about `cache` disks - they were calculated only for one arbitrary cache disk instead of all of them. This closes [#7644](https://github.com/ClickHouse/ClickHouse/issues/7644). [#43194](https://github.com/ClickHouse/ClickHouse/pull/43194) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Throttling algorithm changed to token bucket. [#42665](https://github.com/ClickHouse/ClickHouse/pull/42665) ([Sergei Trifonov](https://github.com/serxa)). +* Mask passwords and secret keys both in `system.query_log` and `/var/log/clickhouse-server/*.log` and also in error messages. [#42484](https://github.com/ClickHouse/ClickHouse/pull/42484) ([Vitaly Baranov](https://github.com/vitlibar)). +* Remove covered parts for fetched part (to avoid possible replication delay growth). [#39737](https://github.com/ClickHouse/ClickHouse/pull/39737) ([Azat Khuzhin](https://github.com/azat)). +* If `/dev/tty` is available, the progress in clickhouse-client and clickhouse-local will be rendered directly to the terminal, without writing to STDERR. It allows getting progress even if STDERR is redirected to a file, and the file will not be polluted by terminal escape sequences. The progress can be disabled by `--progress false`. This closes [#32238](https://github.com/ClickHouse/ClickHouse/issues/32238). [#42003](https://github.com/ClickHouse/ClickHouse/pull/42003) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add support for `FixedString` input to base64 coding functions. [#42285](https://github.com/ClickHouse/ClickHouse/pull/42285) ([ltrk2](https://github.com/ltrk2)). +* Add columns `bytes_on_disk` and `path` to `system.detached_parts`. Closes [#42264](https://github.com/ClickHouse/ClickHouse/issues/42264). [#42303](https://github.com/ClickHouse/ClickHouse/pull/42303) ([chen](https://github.com/xiedeyantu)). +* Improve using the structure from the insertion table in table functions: the setting `use_structure_from_insertion_table_in_table_functions` now has a new possible value - `2` - which means that ClickHouse will try to determine automatically whether the structure from the insertion table can be used. Closes [#40028](https://github.com/ClickHouse/ClickHouse/issues/40028). [#42320](https://github.com/ClickHouse/ClickHouse/pull/42320) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix no progress indication on INSERT FROM INFILE. Closes [#42548](https://github.com/ClickHouse/ClickHouse/issues/42548). [#42634](https://github.com/ClickHouse/ClickHouse/pull/42634) ([chen](https://github.com/xiedeyantu)). +* Refactor function `tokens` to enable max tokens returned for related functions (disabled by default). [#42673](https://github.com/ClickHouse/ClickHouse/pull/42673) ([李扬](https://github.com/taiyang-li)). +* Allow using `Date32` arguments for `formatDateTime` and `FROM_UNIXTIME` functions. [#42737](https://github.com/ClickHouse/ClickHouse/pull/42737) ([Roman Vasin](https://github.com/rvasin)). +* Update tzdata to 2022f. Mexico will no longer observe DST except near the US border: https://www.timeanddate.com/news/time/mexico-abolishes-dst-2022.html. Chihuahua moves to year-round UTC-6 on 2022-10-30. Fiji no longer observes DST. See https://github.com/google/cctz/pull/235 and https://bugs.launchpad.net/ubuntu/+source/tzdata/+bug/1995209.
[#42796](https://github.com/ClickHouse/ClickHouse/pull/42796) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add `FailedAsyncInsertQuery` event metric for async inserts. [#42814](https://github.com/ClickHouse/ClickHouse/pull/42814) ([Krzysztof Góralski](https://github.com/kgoralski)). +* Implement `read-in-order` optimization on top of query plan. It is enabled by default. Set `query_plan_read_in_order = 0` to use the previous AST-based version. [#42829](https://github.com/ClickHouse/ClickHouse/pull/42829) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Increase the size of the upload part exponentially for backups to S3 to avoid errors about the 10 000 parts limit of the S3 multipart upload. [#42833](https://github.com/ClickHouse/ClickHouse/pull/42833) ([Vitaly Baranov](https://github.com/vitlibar)). +* When the merge task is continuously busy and disk space is insufficient, completely expired parts cannot be selected and dropped, resulting in insufficient disk space. Now, when an entire part has expired, no additional disk space needs to be reserved to drop it, which ensures the normal execution of TTL. [#42869](https://github.com/ClickHouse/ClickHouse/pull/42869) ([zhongyuankai](https://github.com/zhongyuankai)). +* Add `oss` function and `OSS` table engine (this is convenient for users). OSS is fully compatible with S3. [#43155](https://github.com/ClickHouse/ClickHouse/pull/43155) ([zzsmdfj](https://github.com/zzsmdfj)). +* Improve error reporting in the collection of OS-related info for the `system.asynchronous_metrics` table. [#43192](https://github.com/ClickHouse/ClickHouse/pull/43192) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Modify the `INFORMATION_SCHEMA` tables in a way so that ClickHouse can connect to itself using the MySQL compatibility protocol. Add columns instead of aliases (related to [#9769](https://github.com/ClickHouse/ClickHouse/issues/9769)). It will improve the compatibility with various MySQL clients. [#43198](https://github.com/ClickHouse/ClickHouse/pull/43198) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Add some functions for compatibility with PowerBI when it connects using the MySQL protocol. [#42612](https://github.com/ClickHouse/ClickHouse/pull/42612) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Better usability for the Dashboard on changes. [#42872](https://github.com/ClickHouse/ClickHouse/pull/42872) ([Vladimir C](https://github.com/vdimir)). + +#### Build/Testing/Packaging Improvement +* Run SQLancer for each pull request and commit to master. [SQLancer](https://github.com/sqlancer/sqlancer) is an OpenSource fuzzer that focuses on automatic detection of logical bugs. [#42397](https://github.com/ClickHouse/ClickHouse/pull/42397) ([Ilya Yatsishin](https://github.com/qoega)). +* Update to latest zlib-ng. [#42463](https://github.com/ClickHouse/ClickHouse/pull/42463) ([Boris Kuschel](https://github.com/bkuschel)). +* Add support for testing ClickHouse server with Jepsen. By the way, we already have support for testing ClickHouse Keeper with Jepsen. This pull request extends it to Replicated tables. [#42619](https://github.com/ClickHouse/ClickHouse/pull/42619) ([Antonio Andelic](https://github.com/antonio2368)). +* Use https://github.com/matus-chochlik/ctcache for clang-tidy results caching. [#42913](https://github.com/ClickHouse/ClickHouse/pull/42913) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Before the fix, the user-defined config was preserved by RPM in `$file.rpmsave`.
The PR fixes this and no longer replaces the user's files with those from packages. [#42936](https://github.com/ClickHouse/ClickHouse/pull/42936) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove some libraries from Ubuntu Docker image. [#42622](https://github.com/ClickHouse/ClickHouse/pull/42622) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Updated normaliser to clone the alias AST. Resolves [#42452](https://github.com/ClickHouse/ClickHouse/issues/42452) Implementation: * Updated QueryNormalizer to clone the alias AST when it is replaced. Previously, just assigning the same AST led to an exception in LogicalExpressionsOptimizer, as it would be the same parent being inserted again. * This bug is not seen with the new analyzer (allow_experimental_analyzer), so no changes for it. Added a test for the same. [#42827](https://github.com/ClickHouse/ClickHouse/pull/42827) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix race for backup of tables in `Lazy` databases. [#43104](https://github.com/ClickHouse/ClickHouse/pull/43104) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix for `skip_unavailable_shards`: it did not work with the `s3Cluster` table function. [#43131](https://github.com/ClickHouse/ClickHouse/pull/43131) ([chen](https://github.com/xiedeyantu)). +* Fix schema inference in `s3Cluster` and improve it in `hdfsCluster`. [#41979](https://github.com/ClickHouse/ClickHouse/pull/41979) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix retries while reading from URL table engines / table function (retriable errors could be retried more times than needed, non-retriable errors resulted in a failed assertion in the code). [#42224](https://github.com/ClickHouse/ClickHouse/pull/42224) ([Kseniia Sumarokova](https://github.com/kssenii)). +* A segmentation fault related to DNS & c-ares has been reported and fixed. [#42234](https://github.com/ClickHouse/ClickHouse/pull/42234) ([Arthur Passos](https://github.com/arthurpassos)). +* Fix `LOGICAL_ERROR` `Arguments of 'plus' have incorrect data types` which may happen in PK analysis (monotonicity check). Fix invalid PK analysis for monotonic binary functions with first constant argument. [#42410](https://github.com/ClickHouse/ClickHouse/pull/42410) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix incorrect key analysis when key types cannot be inside Nullable. This fixes [#42456](https://github.com/ClickHouse/ClickHouse/issues/42456). [#42469](https://github.com/ClickHouse/ClickHouse/pull/42469) ([Amos Bird](https://github.com/amosbird)). +* Fix a typo in a setting name that led to bad usage of the schema inference cache while using the setting `input_format_csv_use_best_effort_in_schema_inference`. Closes [#41735](https://github.com/ClickHouse/ClickHouse/issues/41735). [#42536](https://github.com/ClickHouse/ClickHouse/pull/42536) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix creating a Set with a wrong header when the data type is LowCardinality. Closes [#42460](https://github.com/ClickHouse/ClickHouse/issues/42460). [#42579](https://github.com/ClickHouse/ClickHouse/pull/42579) ([flynn](https://github.com/ucasfl)). +* `(U)Int128` and `(U)Int256` values are now correctly checked in `PREWHERE`. [#42605](https://github.com/ClickHouse/ClickHouse/pull/42605) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix a bug in the functions parser that could have led to a segmentation fault.
[#42724](https://github.com/ClickHouse/ClickHouse/pull/42724) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix the locking in `truncate table`. [#42728](https://github.com/ClickHouse/ClickHouse/pull/42728) ([flynn](https://github.com/ucasfl)). +* Fix possible crash in `web` disks when a file does not exist (or with `OPTIMIZE TABLE FINAL`, which can also eventually hit the same error). [#42767](https://github.com/ClickHouse/ClickHouse/pull/42767) ([Azat Khuzhin](https://github.com/azat)). +* Fix `auth_type` mapping in `system.session_log`, by including `SSL_CERTIFICATE` for the enum values. [#42782](https://github.com/ClickHouse/ClickHouse/pull/42782) ([Miel Donkers](https://github.com/mdonkers)). +* Fix stack-use-after-return under ASAN build in the Create User query parser. [#42804](https://github.com/ClickHouse/ClickHouse/pull/42804) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix `lowerUTF8`/`upperUTF8` in the case where a symbol crosses a 16-byte boundary (a very frequent case if you have strings longer than 16 bytes). [#42812](https://github.com/ClickHouse/ClickHouse/pull/42812) ([Azat Khuzhin](https://github.com/azat)). +* An additional bound check was added to the LZ4 decompression routine to fix misbehaviour in case of malformed input. [#42868](https://github.com/ClickHouse/ClickHouse/pull/42868) ([Nikita Taranov](https://github.com/nickitat)). +* Fix rare possible hang on query cancellation. [#42874](https://github.com/ClickHouse/ClickHouse/pull/42874) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect behavior with multiple disjuncts in hash join, close [#42832](https://github.com/ClickHouse/ClickHouse/issues/42832). [#42876](https://github.com/ClickHouse/ClickHouse/pull/42876) ([Vladimir C](https://github.com/vdimir)). +* Fix a null pointer dereference that could occur when selecting `if ... AS ...` from a three-table join; see the example SQL query in the pull request. [#42883](https://github.com/ClickHouse/ClickHouse/pull/42883) ([zzsmdfj](https://github.com/zzsmdfj)). +* Fix memory sanitizer report in Cluster Discovery, close [#42763](https://github.com/ClickHouse/ClickHouse/issues/42763). [#42905](https://github.com/ClickHouse/ClickHouse/pull/42905) ([Vladimir C](https://github.com/vdimir)). +* Improve DateTime schema inference in the case of an empty string. [#42911](https://github.com/ClickHouse/ClickHouse/pull/42911) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix rare NOT_FOUND_COLUMN_IN_BLOCK error when a projection could be used but no projection is available. This fixes [#42771](https://github.com/ClickHouse/ClickHouse/issues/42771). The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/25563. [#42938](https://github.com/ClickHouse/ClickHouse/pull/42938) ([Amos Bird](https://github.com/amosbird)). +* Fix ATTACH TABLE in the `PostgreSQL` database engine if the table contains the DATETIME data type. Closes [#42817](https://github.com/ClickHouse/ClickHouse/issues/42817). [#42960](https://github.com/ClickHouse/ClickHouse/pull/42960) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix lambda parsing. Closes [#41848](https://github.com/ClickHouse/ClickHouse/issues/41848). [#42979](https://github.com/ClickHouse/ClickHouse/pull/42979) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix incorrect key analysis when nullable keys appear in the middle of a hyperrectangle. This fixes [#43111](https://github.com/ClickHouse/ClickHouse/issues/43111). [#43133](https://github.com/ClickHouse/ClickHouse/pull/43133) ([Amos Bird](https://github.com/amosbird)).
+* Fix several buffer over-reads in deserialization of carefully crafted aggregate function states. [#43159](https://github.com/ClickHouse/ClickHouse/pull/43159) ([Raúl Marín](https://github.com/Algunenano)). +* Fix function `if` in case of NULL and const Nullable arguments. Closes [#43069](https://github.com/ClickHouse/ClickHouse/issues/43069). [#43178](https://github.com/ClickHouse/ClickHouse/pull/43178) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix decimal math overflow in parsing DateTime with the 'best effort' algorithm. Closes [#43061](https://github.com/ClickHouse/ClickHouse/issues/43061). [#43180](https://github.com/ClickHouse/ClickHouse/pull/43180) ([Kruglov Pavel](https://github.com/Avogar)). +* The `indent` field produced by the `git-import` tool was miscalculated. See https://clickhouse.com/docs/en/getting-started/example-datasets/github/. [#43191](https://github.com/ClickHouse/ClickHouse/pull/43191) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fixed unexpected behaviour of `Interval` types with subquery and casting. [#43193](https://github.com/ClickHouse/ClickHouse/pull/43193) ([jh0x](https://github.com/jh0x)). + ### ClickHouse release 22.10, 2022-10-26 #### Backward Incompatible Change @@ -570,7 +673,7 @@ * Support SQL standard CREATE INDEX and DROP INDEX syntax. [#35166](https://github.com/ClickHouse/ClickHouse/pull/35166) ([Jianmei Zhang](https://github.com/zhangjmruc)). * Send profile events for INSERT queries (previously only SELECT was supported). [#37391](https://github.com/ClickHouse/ClickHouse/pull/37391) ([Azat Khuzhin](https://github.com/azat)). * Implement in order aggregation (`optimize_aggregation_in_order`) for fully materialized projections. [#37469](https://github.com/ClickHouse/ClickHouse/pull/37469) ([Azat Khuzhin](https://github.com/azat)). -* Remove subprocess run for kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). +* Remove subprocess run for Kerberos initialization. Added new integration test. Closes [#27651](https://github.com/ClickHouse/ClickHouse/issues/27651). [#38105](https://github.com/ClickHouse/ClickHouse/pull/38105) ([Roman Vasin](https://github.com/rvasin)). * * Add setting `multiple_joins_try_to_keep_original_names` to not rewrite identifier name on multiple JOINs rewrite, close [#34697](https://github.com/ClickHouse/ClickHouse/issues/34697). [#38149](https://github.com/ClickHouse/ClickHouse/pull/38149) ([Vladimir C](https://github.com/vdimir)). * Improved trace-visualizer UX. [#38169](https://github.com/ClickHouse/ClickHouse/pull/38169) ([Sergei Trifonov](https://github.com/serxa)). * Enable stack trace collection and query profiler for AArch64. [#38181](https://github.com/ClickHouse/ClickHouse/pull/38181) ([Maksim Kita](https://github.com/kitaisreal)). @@ -850,8 +953,8 @@ #### Upgrade Notes -* Now, background merges, mutations and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values, and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes of metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
-* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. +* Now, background merges, mutations, and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. This only affects the metric values and makes them better. This change does not introduce any incompatibility, but you may wonder about the changes to the metrics, so we put in this category. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant in this area. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). The ciphers `aes-192-cfb128` and `aes-256-cfb128` were removed, because they are not included in the FIPS certified version of BoringSSL. * `max_memory_usage` setting is removed from the default user profile in `users.xml`. This enables flexible memory limits for queries instead of the old rigid limit of 10 GB. * Disable `log_query_threads` setting by default. It controls the logging of statistics about every thread participating in query execution. After supporting asynchronous reads, the total number of distinct thread ids became too large, and logging into the `query_thread_log` has become too heavy. [#37077](https://github.com/ClickHouse/ClickHouse/pull/37077) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Remove function `groupArraySorted` which has a bug. [#36822](https://github.com/ClickHouse/ClickHouse/pull/36822) ([Alexey Milovidov](https://github.com/alexey-milovidov)). From aa855d86a12c048359d68fb88ea9c55b5c5168a4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 17 Nov 2022 05:51:20 +0300 Subject: [PATCH 64/78] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 051bde44dd2..9fc3a589374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ * `JSONExtract` family of functions will now attempt to coerce to the requested type. [#41502](https://github.com/ClickHouse/ClickHouse/pull/41502) ([Márcio Martins](https://github.com/marcioapm)). #### New Feature -* Adds support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). +* Adds support for retries during INSERTs into ReplicatedMergeTree when a session with ClickHouse Keeper is lost. Apart from fault tolerance, it aims to provide better user experience, - avoid returning a user an error during insert if keeper is restarted (for example, due to upgrade). This is controlled by the `insert_keeper_max_retries` setting, which is disabled by default. [#42607](https://github.com/ClickHouse/ClickHouse/pull/42607) ([Igor Nikonov](https://github.com/devcrafter)). 
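To make the retry behaviour described in the entry above concrete, a minimal sketch; the setting name comes from the entry itself, while the table definition and the retry count of 20 are illustrative assumptions:

```sql
-- Illustrative table; any ReplicatedMergeTree table would behave the same way.
CREATE TABLE insert_retry_demo (x UInt64)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/insert_retry_demo', '{replica}')
ORDER BY x;

-- Disabled by default; a non-zero value lets the INSERT survive a short Keeper restart.
SET insert_keeper_max_retries = 20;
INSERT INTO insert_retry_demo VALUES (1), (2), (3);
```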
* Add `Hudi` and `DeltaLake` table engines, read-only, only for tables on S3. [#41054](https://github.com/ClickHouse/ClickHouse/pull/41054) ([Daniil Rubin](https://github.com/rubin-do), [Kseniia Sumarokova](https://github.com/kssenii)). * Add table function `hudi` and `deltaLake`. [#43080](https://github.com/ClickHouse/ClickHouse/pull/43080) ([flynn](https://github.com/ucasfl)). * Support for composite time intervals. 1. Add, subtract and negate operations are now available on Intervals. In the case where the types of Intervals are different, they will be transformed into the Tuple of those types. 2. A tuple of intervals can be added to or subtracted from a Date/DateTime field. 3. Added parsing of Intervals with different types, for example: `INTERVAL '1 HOUR 1 MINUTE 1 SECOND'`. [#42195](https://github.com/ClickHouse/ClickHouse/pull/42195) ([Nikolay Degterinsky](https://github.com/evillique)). From c67f5e26002a0deaeb4d35c0734f8507eab04031 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 17 Nov 2022 04:40:09 +0100 Subject: [PATCH 65/78] Update gtest_column_dump_structure.cpp --- src/Columns/tests/gtest_column_dump_structure.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/tests/gtest_column_dump_structure.cpp b/src/Columns/tests/gtest_column_dump_structure.cpp index 69769ba0db9..e00c77798c8 100644 --- a/src/Columns/tests/gtest_column_dump_structure.cpp +++ b/src/Columns/tests/gtest_column_dump_structure.cpp @@ -10,7 +10,7 @@ TEST(IColumn, dumpStructure) { auto type_lc = std::make_shared(std::make_shared()); ColumnPtr column_lc = type_lc->createColumn(); - auto expected_structure = "ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))"; + String expected_structure = "ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))"; std::vector threads; for (size_t i = 0; i < 6; ++i) From 10d7ce98154e3532f36072f331dd90973571f1a5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 17 Nov 2022 05:41:03 +0100 Subject: [PATCH 66/78] pmod: compatibility with Spark, better documentation --- CHANGELOG.md | 2 +- src/Functions/modulo.cpp | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 051bde44dd2..034ba26897e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,7 @@ * Added applied row-level policies to `system.query_log`. [#39819](https://github.com/ClickHouse/ClickHouse/pull/39819) ([Vladimir Chebotaryov](https://github.com/quickhouse)). * Add four-letter command `csnp` for manually creating snapshots in ClickHouse Keeper. Additionally, `lgif` was added to get Raft information for a specific node (e.g. index of last created snapshot, last committed log index). [#41766](https://github.com/ClickHouse/ClickHouse/pull/41766) ([JackyWoo](https://github.com/JackyWoo)). * Add function `ascii` like in Apache Spark: https://spark.apache.org/docs/latest/api/sql/#ascii. [#42670](https://github.com/ClickHouse/ClickHouse/pull/42670) ([李扬](https://github.com/taiyang-li)). -* Add function `pmod` which returns non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). +* Add function `positive_modulo` (`pmod`) which returns non-negative result based on modulo. [#42755](https://github.com/ClickHouse/ClickHouse/pull/42755) ([李扬](https://github.com/taiyang-li)). * Add function `formatReadableDecimalSize`. 
[#42774](https://github.com/ClickHouse/ClickHouse/pull/42774) ([Alejandro](https://github.com/alexon1234)). * Add function `randCanonical`, which is similar to the `rand` function in Apache Spark or Impala. The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). [#43124](https://github.com/ClickHouse/ClickHouse/pull/43124) ([李扬](https://github.com/taiyang-li)). * Add function `displayName`, closes [#36770](https://github.com/ClickHouse/ClickHouse/issues/36770). [#37681](https://github.com/ClickHouse/ClickHouse/pull/37681) ([hongbin](https://github.com/xlwh)). diff --git a/src/Functions/modulo.cpp b/src/Functions/modulo.cpp index 9cd104cd1dc..be052b25af4 100644 --- a/src/Functions/modulo.cpp +++ b/src/Functions/modulo.cpp @@ -182,7 +182,7 @@ REGISTER_FUNCTION(ModuloLegacy) struct NamePositiveModulo { - static constexpr auto name = "positive_modulo"; + static constexpr auto name = "positiveModulo"; }; using FunctionPositiveModulo = BinaryArithmeticOverloadResolver; @@ -191,11 +191,17 @@ REGISTER_FUNCTION(PositiveModulo) factory.registerFunction( { R"( -Calculates the remainder when dividing `a` by `b`. Similar to function `modulo` except that `positive_modulo` always return non-negative number. +Calculates the remainder when dividing `a` by `b`. Similar to function `modulo` except that `positiveModulo` always return non-negative number. +Returns the difference between `a` and the nearest integer not greater than `a` divisible by `b`. +In other words, the function returning the modulus (modulo) in the terms of Modular Arithmetic. )", - Documentation::Examples{{"positive_modulo", "SELECT positive_modulo(-1000, 32);"}}, + Documentation::Examples{{"positiveModulo", "SELECT positiveModulo(-1, 10);"}}, Documentation::Categories{"Arithmetic"}}, FunctionFactory::CaseInsensitive); + + factory.registerAlias("positive_modulo", "positiveModulo", FunctionFactory::CaseInsensitive); + /// Compatibility with Spark: + factory.registerAlias("pmod", "positiveModulo", FunctionFactory::CaseInsensitive); } } From 41b557f926cadbed6e62020064ae6d419d33d001 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 17 Nov 2022 09:16:47 +0000 Subject: [PATCH 67/78] Disable DeltaLake and hudi table functions in readonly mode --- .../{TableFunctionDelta.cpp => TableFunctionDeltaLake.cpp} | 6 +++--- .../{TableFunctionDelta.h => TableFunctionDeltaLake.h} | 0 src/TableFunctions/TableFunctionHudi.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename src/TableFunctions/{TableFunctionDelta.cpp => TableFunctionDeltaLake.cpp} (96%) rename src/TableFunctions/{TableFunctionDelta.h => TableFunctionDeltaLake.h} (100%) diff --git a/src/TableFunctions/TableFunctionDelta.cpp b/src/TableFunctions/TableFunctionDeltaLake.cpp similarity index 96% rename from src/TableFunctions/TableFunctionDelta.cpp rename to src/TableFunctions/TableFunctionDeltaLake.cpp index 25ea2aaa77f..0f5801d57ac 100644 --- a/src/TableFunctions/TableFunctionDelta.cpp +++ b/src/TableFunctions/TableFunctionDeltaLake.cpp @@ -13,7 +13,7 @@ # include # include # include -# include +# include # include # include "registerTableFunctions.h" @@ -160,9 +160,9 @@ void registerTableFunctionDelta(TableFunctionFactory & factory) factory.registerFunction( {.documentation = {R"(The table function can be used to read the DeltaLake table stored on object store.)", - Documentation::Examples{{"hudi", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)"}}, + 
Documentation::Examples{{"deltaLake", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)"}}, Documentation::Categories{"DataLake"}}, - .allow_readonly = true}); + .allow_readonly = false}); } } diff --git a/src/TableFunctions/TableFunctionDelta.h b/src/TableFunctions/TableFunctionDeltaLake.h similarity index 100% rename from src/TableFunctions/TableFunctionDelta.h rename to src/TableFunctions/TableFunctionDeltaLake.h diff --git a/src/TableFunctions/TableFunctionHudi.cpp b/src/TableFunctions/TableFunctionHudi.cpp index b1db90da550..2e27d192b58 100644 --- a/src/TableFunctions/TableFunctionHudi.cpp +++ b/src/TableFunctions/TableFunctionHudi.cpp @@ -162,7 +162,7 @@ void registerTableFunctionHudi(TableFunctionFactory & factory) = {R"(The table function can be used to read the Hudi table stored on object store.)", Documentation::Examples{{"hudi", "SELECT * FROM hudi(url, access_key_id, secret_access_key)"}}, Documentation::Categories{"DataLake"}}, - .allow_readonly = true}); + .allow_readonly = false}); } } From 11b535d443f12504c79a48203e86d029005f04e6 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 17 Nov 2022 11:17:27 +0100 Subject: [PATCH 68/78] impl (#43283) --- docs/en/sql-reference/functions/random-functions.md | 2 +- src/Functions/canonicalRand.cpp | 4 ++-- tests/queries/0_stateless/01047_nullable_rand.sql | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 08f2620a009..4efa2131eb6 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -24,7 +24,7 @@ Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type Uses a linear congruential generator. -## canonicalRand +## randCanonical The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). Non-deterministic. Return type is Float64. diff --git a/src/Functions/canonicalRand.cpp b/src/Functions/canonicalRand.cpp index d0b8c655e14..0f168142177 100644 --- a/src/Functions/canonicalRand.cpp +++ b/src/Functions/canonicalRand.cpp @@ -34,7 +34,7 @@ private: struct NameCanonicalRand { - static constexpr auto name = "canonicalRand"; + static constexpr auto name = "randCanonical"; }; class FunctionCanonicalRand : public FunctionRandomImpl @@ -52,7 +52,7 @@ REGISTER_FUNCTION(CanonicalRand) The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). Non-deterministic. Return type is Float64. 
)", - Documentation::Examples{{"canonicalRand", "SELECT canonicalRand()"}}, + Documentation::Examples{{"randCanonical", "SELECT randCanonical()"}}, Documentation::Categories{"Mathematical"}}); } diff --git a/tests/queries/0_stateless/01047_nullable_rand.sql b/tests/queries/0_stateless/01047_nullable_rand.sql index 9d3c361c543..e5633637db6 100644 --- a/tests/queries/0_stateless/01047_nullable_rand.sql +++ b/tests/queries/0_stateless/01047_nullable_rand.sql @@ -1,13 +1,13 @@ select toTypeName(rand(cast(4 as Nullable(UInt8)))); -select toTypeName(canonicalRand(CAST(4 as Nullable(UInt8)))); +select toTypeName(randCanonical(CAST(4 as Nullable(UInt8)))); select toTypeName(randConstant(CAST(4 as Nullable(UInt8)))); select toTypeName(rand(Null)); -select toTypeName(canonicalRand(Null)); +select toTypeName(randCanonical(Null)); select toTypeName(randConstant(Null)); select rand(cast(4 as Nullable(UInt8))) * 0; -select canonicalRand(cast(4 as Nullable(UInt8))) * 0; +select randCanonical(cast(4 as Nullable(UInt8))) * 0; select randConstant(CAST(4 as Nullable(UInt8))) * 0; select rand(Null) * 0; -select canonicalRand(Null) * 0; +select randCanonical(Null) * 0; select randConstant(Null) * 0; From 7beb58b0cf2cd04fdf7548ce0574bbb1ec6280a5 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 17 Nov 2022 13:19:02 +0100 Subject: [PATCH 69/78] Optimize merge of uniqExact without_key (#43072) * impl for uniqExact * rm unused (read|write)Text methods * fix style * small fixes * impl for variadic uniqExact * refactor * fix style * more agressive inlining * disable if max_threads=1 * small improvements * review fixes * Revert "rm unused (read|write)Text methods" This reverts commit a7e74805842572f6fa2c28ea111ab8ca7c19ad21. * encapsulate is_able_to_parallelize_merge in Data * encapsulate is_exact & argument_is_tuple in Data --- .../AggregateFunctionUniq.cpp | 56 ++-- .../AggregateFunctionUniq.h | 307 +++++++++++++++--- src/AggregateFunctions/Helpers.h | 13 + src/AggregateFunctions/IAggregateFunction.h | 15 +- src/AggregateFunctions/UniqExactSet.h | 112 +++++++ src/AggregateFunctions/UniquesHashSet.h | 2 +- src/Common/HashTable/HashSet.h | 57 ++++ src/Common/HashTable/HashTable.h | 20 +- src/Common/HashTable/TwoLevelHashTable.h | 8 +- src/Common/examples/small_table.cpp | 2 +- src/Common/tests/gtest_hash_table.cpp | 56 ++++ src/Interpreters/Aggregator.cpp | 12 +- .../test_aggregate_function_state.py | 228 +++++++++++++ .../test_aggregate_function_state_avg.py | 82 ----- tests/performance/uniq_without_key.xml | 33 ++ 15 files changed, 832 insertions(+), 171 deletions(-) create mode 100644 src/AggregateFunctions/UniqExactSet.h create mode 100644 tests/integration/test_backward_compatibility/test_aggregate_function_state.py delete mode 100644 tests/integration/test_backward_compatibility/test_aggregate_function_state_avg.py create mode 100644 tests/performance/uniq_without_key.xml diff --git a/src/AggregateFunctions/AggregateFunctionUniq.cpp b/src/AggregateFunctions/AggregateFunctionUniq.cpp index 0d1c831c839..1c90767131c 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.cpp +++ b/src/AggregateFunctions/AggregateFunctionUniq.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace DB { @@ -28,8 +29,9 @@ namespace /** `DataForVariadic` is a data structure that will be used for `uniq` aggregate function of multiple arguments. * It differs, for example, in that it uses a trivial hash function, since `uniq` of many arguments first hashes them out itself. 
*/ -template -AggregateFunctionPtr createAggregateFunctionUniq(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) +template typename DataForVariadic> +AggregateFunctionPtr +createAggregateFunctionUniq(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { assertNoParameters(name, params); @@ -61,21 +63,22 @@ AggregateFunctionPtr createAggregateFunctionUniq(const std::string & name, const else if (which.isTuple()) { if (use_exact_hash_function) - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); else - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); } } /// "Variadic" method also works as a fallback generic case for single argument. if (use_exact_hash_function) - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); else - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); } -template class Data, typename DataForVariadic> -AggregateFunctionPtr createAggregateFunctionUniq(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) +template typename Data, template typename DataForVariadic, bool is_able_to_parallelize_merge> +AggregateFunctionPtr +createAggregateFunctionUniq(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *) { assertNoParameters(name, params); @@ -91,35 +94,35 @@ AggregateFunctionPtr createAggregateFunctionUniq(const std::string & name, const { const IDataType & argument_type = *argument_types[0]; - AggregateFunctionPtr res(createWithNumericType(*argument_types[0], argument_types)); + AggregateFunctionPtr res(createWithNumericType(*argument_types[0], argument_types)); WhichDataType which(argument_type); if (res) return res; else if (which.isDate()) - return std::make_shared>>(argument_types); + return std::make_shared>>(argument_types); else if (which.isDate32()) - return std::make_shared>>(argument_types); + return std::make_shared>>(argument_types); else if (which.isDateTime()) - return std::make_shared>>(argument_types); + return std::make_shared>>(argument_types); else if (which.isStringOrFixedString()) - return std::make_shared>>(argument_types); + return std::make_shared>>(argument_types); else if (which.isUUID()) - return std::make_shared>>(argument_types); + return std::make_shared>>(argument_types); else if (which.isTuple()) { if (use_exact_hash_function) - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); else - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); } } /// "Variadic" method also works as a fallback generic case for single argument. 
if (use_exact_hash_function) - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); else - return std::make_shared>(argument_types); + return std::make_shared>>(argument_types); } } @@ -132,14 +135,23 @@ void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory) {createAggregateFunctionUniq, properties}); factory.registerFunction("uniqHLL12", - {createAggregateFunctionUniq, properties}); + {createAggregateFunctionUniq, properties}); - factory.registerFunction("uniqExact", - {createAggregateFunctionUniq>, properties}); + auto assign_bool_param = [](const std::string & name, const DataTypes & argument_types, const Array & params, const Settings * settings) + { + /// Using two level hash set if we wouldn't be able to merge in parallel can cause ~10% slowdown. + if (settings && settings->max_threads > 1) + return createAggregateFunctionUniq< + true, AggregateFunctionUniqExactData, AggregateFunctionUniqExactDataForVariadic, true /* is_able_to_parallelize_merge */>(name, argument_types, params, settings); + else + return createAggregateFunctionUniq< + true, AggregateFunctionUniqExactData, AggregateFunctionUniqExactDataForVariadic, false /* is_able_to_parallelize_merge */>(name, argument_types, params, settings); + }; + factory.registerFunction("uniqExact", {assign_bool_param, properties}); #if USE_DATASKETCHES factory.registerFunction("uniqTheta", - {createAggregateFunctionUniq, properties}); + {createAggregateFunctionUniq, properties}); #endif } diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h index fe2530800cc..1a98bfc8456 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/src/AggregateFunctions/AggregateFunctionUniq.h @@ -1,7 +1,10 @@ #pragma once -#include +#include +#include #include +#include +#include #include @@ -13,17 +16,18 @@ #include +#include #include #include #include -#include -#include #include +#include -#include #include #include +#include #include +#include namespace DB @@ -37,94 +41,128 @@ struct AggregateFunctionUniqUniquesHashSetData using Set = UniquesHashSet>; Set set; + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = false; + static String getName() { return "uniq"; } }; /// For a function that takes multiple arguments. Such a function pre-hashes them in advance, so TrivialHash is used here. 
+template struct AggregateFunctionUniqUniquesHashSetDataForVariadic { using Set = UniquesHashSet; Set set; + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = true; + constexpr static bool is_exact = is_exact_; + constexpr static bool argument_is_tuple = argument_is_tuple_; + static String getName() { return "uniq"; } }; /// uniqHLL12 -template +template struct AggregateFunctionUniqHLL12Data { using Set = HyperLogLogWithSmallSetOptimization; Set set; - static String getName() { return "uniqHLL12"; } -}; - -template <> -struct AggregateFunctionUniqHLL12Data -{ - using Set = HyperLogLogWithSmallSetOptimization; - Set set; + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = false; static String getName() { return "uniqHLL12"; } }; template <> -struct AggregateFunctionUniqHLL12Data +struct AggregateFunctionUniqHLL12Data { using Set = HyperLogLogWithSmallSetOptimization; Set set; + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = false; + static String getName() { return "uniqHLL12"; } }; +template <> +struct AggregateFunctionUniqHLL12Data +{ + using Set = HyperLogLogWithSmallSetOptimization; + Set set; + + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = false; + + static String getName() { return "uniqHLL12"; } +}; + +template struct AggregateFunctionUniqHLL12DataForVariadic { using Set = HyperLogLogWithSmallSetOptimization; Set set; + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = true; + constexpr static bool is_exact = is_exact_; + constexpr static bool argument_is_tuple = argument_is_tuple_; + static String getName() { return "uniqHLL12"; } }; /// uniqExact -template +template struct AggregateFunctionUniqExactData { using Key = T; /// When creating, the hash table must be small. - using Set = HashSet< - Key, - HashCRC32, - HashTableGrower<4>, - HashTableAllocatorWithStackMemory>; + using SingleLevelSet = HashSet, HashTableGrower<4>, HashTableAllocatorWithStackMemory>; + using TwoLevelSet = TwoLevelHashSet>; + using Set = UniqExactSet; Set set; + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = false; + static String getName() { return "uniqExact"; } }; /// For rows, we put the SipHash values (128 bits) into the hash table. -template <> -struct AggregateFunctionUniqExactData +template +struct AggregateFunctionUniqExactData { using Key = UInt128; /// When creating, the hash table must be small. 
- using Set = HashSet< - Key, - UInt128TrivialHash, - HashTableGrower<3>, - HashTableAllocatorWithStackMemory>; + using SingleLevelSet = HashSet, HashTableAllocatorWithStackMemory>; + using TwoLevelSet = TwoLevelHashSet; + using Set = UniqExactSet; Set set; + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = false; + static String getName() { return "uniqExact"; } }; +template +struct AggregateFunctionUniqExactDataForVariadic : AggregateFunctionUniqExactData +{ + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = true; + constexpr static bool is_exact = is_exact_; + constexpr static bool argument_is_tuple = argument_is_tuple_; +}; /// uniqTheta #if USE_DATASKETCHES @@ -134,14 +172,37 @@ struct AggregateFunctionUniqThetaData using Set = ThetaSketchData; Set set; + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = false; + static String getName() { return "uniqTheta"; } }; +template +struct AggregateFunctionUniqThetaDataForVariadic : AggregateFunctionUniqThetaData +{ + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = true; + constexpr static bool is_exact = is_exact_; + constexpr static bool argument_is_tuple = argument_is_tuple_; +}; + #endif namespace detail { +template +struct IsUniqExactSet : std::false_type +{ +}; + +template +struct IsUniqExactSet> : std::true_type +{ +}; + + /** Hash function for uniq. */ template struct AggregateFunctionUniqTraits @@ -162,17 +223,31 @@ template struct AggregateFunctionUniqTraits }; -/** The structure for the delegation work to add one element to the `uniq` aggregate functions. +/** The structure for the delegation work to add elements to the `uniq` aggregate functions. * Used for partial specialization to add strings. */ template -struct OneAdder +struct Adder { - static void ALWAYS_INLINE add(Data & data, const IColumn & column, size_t row_num) + /// We have to introduce this template parameter (and a bunch of ugly code dealing with it), because we cannot + /// add runtime branches in whatever_hash_set::insert - it will immediately pop up in the perf top. 
+ template + static void ALWAYS_INLINE add(Data & data, const IColumn ** columns, size_t num_args, size_t row_num) { - if constexpr (std::is_same_v - || std::is_same_v>) + if constexpr (Data::is_variadic) { + if constexpr (IsUniqExactSet::value) + data.set.template insert( + UniqVariadicHash::apply(num_args, columns, row_num)); + else + data.set.insert(T{UniqVariadicHash::apply(num_args, columns, row_num)}); + } + else if constexpr ( + std::is_same_v< + Data, + AggregateFunctionUniqUniquesHashSetData> || std::is_same_v>) + { + const auto & column = *columns[0]; if constexpr (!std::is_same_v) { using ValueType = typename decltype(data.set)::value_type; @@ -185,11 +260,13 @@ struct OneAdder data.set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size)); } } - else if constexpr (std::is_same_v>) + else if constexpr (std::is_same_v>) { + const auto & column = *columns[0]; if constexpr (!std::is_same_v) { - data.set.insert(assert_cast &>(column).getData()[row_num]); + data.set.template insert( + assert_cast &>(column).getData()[row_num]); } else { @@ -200,16 +277,72 @@ struct OneAdder hash.update(value.data, value.size); hash.get128(key); - data.set.insert(key); + data.set.template insert(key); } } #if USE_DATASKETCHES else if constexpr (std::is_same_v) { + const auto & column = *columns[0]; data.set.insertOriginal(column.getDataAt(row_num)); } #endif } + + static void ALWAYS_INLINE + add(Data & data, const IColumn ** columns, size_t num_args, size_t row_begin, size_t row_end, const char8_t * flags, const UInt8 * null_map) + { + bool use_single_level_hash_table = true; + if constexpr (Data::is_able_to_parallelize_merge) + use_single_level_hash_table = data.set.isSingleLevel(); + + if (use_single_level_hash_table) + addImpl(data, columns, num_args, row_begin, row_end, flags, null_map); + else + addImpl(data, columns, num_args, row_begin, row_end, flags, null_map); + + if constexpr (Data::is_able_to_parallelize_merge) + { + if (data.set.isSingleLevel() && data.set.size() > 100'000) + data.set.convertToTwoLevel(); + } + } + +private: + template + static void ALWAYS_INLINE + addImpl(Data & data, const IColumn ** columns, size_t num_args, size_t row_begin, size_t row_end, const char8_t * flags, const UInt8 * null_map) + { + if (!flags) + { + if (!null_map) + { + for (size_t row = row_begin; row < row_end; ++row) + add(data, columns, num_args, row); + } + else + { + for (size_t row = row_begin; row < row_end; ++row) + if (!null_map[row]) + add(data, columns, num_args, row); + } + } + else + { + if (!null_map) + { + for (size_t row = row_begin; row < row_end; ++row) + if (flags[row]) + add(data, columns, num_args, row); + } + else + { + for (size_t row = row_begin; row < row_end; ++row) + if (!null_map[row] && flags[row]) + add(data, columns, num_args, row); + } + } + } }; } @@ -219,9 +352,15 @@ struct OneAdder template class AggregateFunctionUniq final : public IAggregateFunctionDataHelper> { +private: + static constexpr size_t num_args = 1; + static constexpr bool is_able_to_parallelize_merge = Data::is_able_to_parallelize_merge; + public: - AggregateFunctionUniq(const DataTypes & argument_types_) - : IAggregateFunctionDataHelper>(argument_types_, {}) {} + explicit AggregateFunctionUniq(const DataTypes & argument_types_) + : IAggregateFunctionDataHelper>(argument_types_, {}) + { + } String getName() const override { return Data::getName(); } @@ -235,7 +374,18 @@ public: /// ALWAYS_INLINE is required to have better code layout for uniqHLL12 function void ALWAYS_INLINE add(AggregateDataPtr 
__restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { - detail::OneAdder::add(this->data(place), *columns[0], row_num); + detail::Adder::add(this->data(place), columns, num_args, row_num); + } + + void ALWAYS_INLINE addBatchSinglePlace( + size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena *, ssize_t if_argument_pos) + const override + { + const char8_t * flags = nullptr; + if (if_argument_pos >= 0) + flags = assert_cast(*columns[if_argument_pos]).getData().data(); + + detail::Adder::add(this->data(place), columns, num_args, row_begin, row_end, flags, nullptr /* null_map */); } void addManyDefaults( @@ -244,7 +394,23 @@ public: size_t /*length*/, Arena * /*arena*/) const override { - detail::OneAdder::add(this->data(place), *columns[0], 0); + detail::Adder::add(this->data(place), columns, num_args, 0); + } + + void addBatchSinglePlaceNotNull( + size_t row_begin, + size_t row_end, + AggregateDataPtr __restrict place, + const IColumn ** columns, + const UInt8 * null_map, + Arena *, + ssize_t if_argument_pos) const override + { + const char8_t * flags = nullptr; + if (if_argument_pos >= 0) + flags = assert_cast(*columns[if_argument_pos]).getData().data(); + + detail::Adder::add(this->data(place), columns, num_args, row_begin, row_end, flags, null_map); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override @@ -252,6 +418,16 @@ public: this->data(place).set.merge(this->data(rhs).set); } + bool isAbleToParallelizeMerge() const override { return is_able_to_parallelize_merge; } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena *) const override + { + if constexpr (is_able_to_parallelize_merge) + this->data(place).set.merge(this->data(rhs).set, &thread_pool); + else + this->data(place).set.merge(this->data(rhs).set); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional /* version */) const override { this->data(place).set.write(buf); @@ -273,15 +449,20 @@ public: * You can pass multiple arguments as is; You can also pass one argument - a tuple. * But (for the possibility of efficient implementation), you can not pass several arguments, among which there are tuples. 
*/ -template -class AggregateFunctionUniqVariadic final : public IAggregateFunctionDataHelper> +template +class AggregateFunctionUniqVariadic final : public IAggregateFunctionDataHelper> { private: + using T = typename Data::Set::value_type; + + static constexpr size_t is_able_to_parallelize_merge = Data::is_able_to_parallelize_merge; + static constexpr size_t argument_is_tuple = Data::argument_is_tuple; + size_t num_args = 0; public: - AggregateFunctionUniqVariadic(const DataTypes & arguments) - : IAggregateFunctionDataHelper>(arguments, {}) + explicit AggregateFunctionUniqVariadic(const DataTypes & arguments) + : IAggregateFunctionDataHelper>(arguments, {}) { if (argument_is_tuple) num_args = typeid_cast(*arguments[0]).getElements().size(); @@ -300,8 +481,34 @@ public: void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { - this->data(place).set.insert(typename Data::Set::value_type( - UniqVariadicHash::apply(num_args, columns, row_num))); + detail::Adder::add(this->data(place), columns, num_args, row_num); + } + + void addBatchSinglePlace( + size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena *, ssize_t if_argument_pos) + const override + { + const char8_t * flags = nullptr; + if (if_argument_pos >= 0) + flags = assert_cast(*columns[if_argument_pos]).getData().data(); + + detail::Adder::add(this->data(place), columns, num_args, row_begin, row_end, flags, nullptr /* null_map */); + } + + void addBatchSinglePlaceNotNull( + size_t row_begin, + size_t row_end, + AggregateDataPtr __restrict place, + const IColumn ** columns, + const UInt8 * null_map, + Arena *, + ssize_t if_argument_pos) const override + { + const char8_t * flags = nullptr; + if (if_argument_pos >= 0) + flags = assert_cast(*columns[if_argument_pos]).getData().data(); + + detail::Adder::add(this->data(place), columns, num_args, row_begin, row_end, flags, null_map); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override @@ -309,6 +516,16 @@ public: this->data(place).set.merge(this->data(rhs).set); } + bool isAbleToParallelizeMerge() const override { return is_able_to_parallelize_merge; } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena *) const override + { + if constexpr (is_able_to_parallelize_merge) + this->data(place).set.merge(this->data(rhs).set, &thread_pool); + else + this->data(place).set.merge(this->data(rhs).set); + } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional /* version */) const override { this->data(place).set.write(buf); diff --git a/src/AggregateFunctions/Helpers.h b/src/AggregateFunctions/Helpers.h index 6e140f4b9cf..c97733571a3 100644 --- a/src/AggregateFunctions/Helpers.h +++ b/src/AggregateFunctions/Helpers.h @@ -74,6 +74,19 @@ static IAggregateFunction * createWithNumericType(const IDataType & argument_typ return nullptr; } +template