From 58755cb156fcade11af5151114220536dcf12615 Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Sun, 8 May 2022 14:02:40 +0800 Subject: [PATCH 01/37] add zstd_window_log_max setting --- src/Core/Settings.h | 2 ++ src/IO/CompressionMethod.cpp | 10 ++++++++++ src/IO/CompressionMethod.h | 10 +++++++++- src/IO/ZstdInflatingReadBuffer.cpp | 8 +++++++- src/IO/ZstdInflatingReadBuffer.h | 3 ++- src/Server/HTTPHandler.cpp | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 6 ++++-- src/Storages/StorageFile.cpp | 4 ++-- src/Storages/StorageS3.cpp | 7 +++++-- src/Storages/StorageURL.cpp | 6 ++++-- 10 files changed, 46 insertions(+), 12 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index be73465eea0..895e10659e5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -176,6 +176,8 @@ class IColumn; \ M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \ \ + M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD", 0) \ + \ M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \ M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \ \ diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index fe4772948ad..dc955322e16 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -125,6 +125,16 @@ static std::unique_ptr createCompressedWrapper( throw Exception("Unsupported compression method", ErrorCodes::NOT_IMPLEMENTED); } +std::unique_ptr wrapReadBufferWithCompressionMethod( + std::unique_ptr nested, CompressionMethod method, const Settings &settings, size_t buf_size, char * existing_memory, size_t alignment) +{ + if (method == CompressionMethod::None) + return nested; + else if (method == CompressionMethod::Zstd) + return std::make_unique(std::move(nested), buf_size, existing_memory, alignment, settings.zstd_window_log_max); + return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment); +} + std::unique_ptr wrapReadBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) { diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index 3953ba9d212..e87c7e5e7a1 100644 --- a/src/IO/CompressionMethod.h +++ b/src/IO/CompressionMethod.h @@ -4,7 +4,7 @@ #include #include - +#include namespace DB { @@ -47,6 +47,14 @@ std::string toContentEncodingName(CompressionMethod method); */ CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint); +std::unique_ptr wrapReadBufferWithCompressionMethod( + std::unique_ptr nested, + CompressionMethod method, + const Settings &settings, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0); + std::unique_ptr wrapReadBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 712ea6960ef..0d026cdab9a 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -8,7 +8,7 @@ namespace ErrorCodes extern const int ZSTD_DECODER_FAILED; } -ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment) 
+ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment) { dctx = ZSTD_createDCtx(); @@ -19,6 +19,12 @@ ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_ { throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: zstd version: {}", ZSTD_VERSION_STRING); } + + size_t ret = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, zstd_window_log_max); + if (ZSTD_isError(ret)) + { + throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: {}", ZSTD_getErrorName(ret)); + } } ZstdInflatingReadBuffer::~ZstdInflatingReadBuffer() diff --git a/src/IO/ZstdInflatingReadBuffer.h b/src/IO/ZstdInflatingReadBuffer.h index a0c20b79d80..faa6231d4e2 100644 --- a/src/IO/ZstdInflatingReadBuffer.h +++ b/src/IO/ZstdInflatingReadBuffer.h @@ -20,7 +20,8 @@ public: std::unique_ptr in_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0); + size_t alignment = 0, + int zstd_window_log_max = 0); ~ZstdInflatingReadBuffer() override; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 0ce81ec7be4..cac62a18ce4 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -642,7 +642,7 @@ void HTTPHandler::processQuery( /// Request body can be compressed using algorithm specified in the Content-Encoding header. String http_request_compression_method_str = request.get("Content-Encoding", ""); auto in_post = wrapReadBufferWithCompressionMethod( - wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str)); + wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str), context->getSettingsRef()); /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter. 
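For context, a minimal usage sketch of the new setting (the file path and the window-log values below are chosen for illustration only; the default value 0 keeps zstd's built-in limit, and the stateless test added later in this series exercises the same idea against a real .zstd file):

-- Decompression fails when the frame needs a window larger than 2^zstd_window_log_max allows.
SELECT count()
FROM file('data/wide_window.zstd', 'JSONEachRow', 'foo String')
SETTINGS zstd_window_log_max = 20;  -- expected to throw ZSTD_DECODER_FAILED for such a frame

-- Raising the limit lets the same frame decompress.
SELECT count()
FROM file('data/wide_window.zstd', 'JSONEachRow', 'foo String')
SETTINGS zstd_window_log_max = 31;
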
diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 441504a3e7e..762cb513c14 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -194,8 +194,9 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (it == paths.end()) return nullptr; auto compression = chooseCompressionMethod(*it, compression_method); + const auto &settings = ctx->getSettingsRef(); return wrapReadBufferWithCompressionMethod( - std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression); + std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression, settings); }; return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths.size() > 1, ctx); } @@ -324,7 +325,8 @@ bool HDFSSource::initialize() const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression); + const auto &settings = getContext()->getSettingsRef(); + read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression, settings); auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index c460b8a4c67..52ec0e4e48e 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -206,8 +206,8 @@ std::unique_ptr createReadBuffer( auto & in = static_cast(*nested_buffer); in.setProgressCallback(context); } - - return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + const auto &settings = context->getSettingsRef(); + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, settings); } } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 301f33cb309..72ca9d6401f 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -302,7 +302,8 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); + const auto &settings = getContext()->getSettingsRef(); + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint), settings); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -1017,10 +1018,12 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( read_keys_in_distributed_processing->push_back(key); first = false; + const auto &settings = ctx->getSettingsRef(); return wrapReadBufferWithCompressionMethod( std::make_unique( s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), - chooseCompressionMethod(key, compression_method)); + chooseCompressionMethod(key, compression_method), + settings); }; return readSchemaFromFormat(format, format_settings, read_buffer_iterator, is_key_with_globs, ctx); diff --git a/src/Storages/StorageURL.cpp 
b/src/Storages/StorageURL.cpp index 35752835581..a3cdb36a652 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -349,7 +349,8 @@ namespace std::move(read_buffer_factory), threadPoolCallbackRunner(IOThreadPool::get()), download_threads), - chooseCompressionMethod(request_uri.getPath(), compression_method)); + chooseCompressionMethod(request_uri.getPath(), compression_method), + settings); } } catch (const Poco::Exception & e) @@ -380,7 +381,8 @@ namespace delay_initialization, /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error), - chooseCompressionMethod(request_uri.getPath(), compression_method)); + chooseCompressionMethod(request_uri.getPath(), compression_method), + settings); } catch (...) { From bb2b2bfa474ce775b422613cb3141d74b06a07b1 Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Mon, 9 May 2022 13:57:27 +0800 Subject: [PATCH 02/37] add test --- .../02293_test_zstd_window_log_max.reference | 2 ++ .../0_stateless/02293_test_zstd_window_log_max.sh | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100644 tests/queries/0_stateless/02293_test_zstd_window_log_max.reference create mode 100755 tests/queries/0_stateless/02293_test_zstd_window_log_max.sh diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference b/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference new file mode 100644 index 00000000000..98ca7fb2d29 --- /dev/null +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference @@ -0,0 +1,2 @@ +1 +40 diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh new file mode 100755 index 00000000000..764b0e8b2d3 --- /dev/null +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# reuse the test data in 02293_test_zstd_window_log_max.sh +$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=20" 2>&1 | grep -c \ + "Code: 561. DB::Exception: Zstd stream encoding failed: error 'Frame requires too much memory for decoding'; zstd version: 1.5.0: While executing File. (ZSTD_DECODER_FAILED)" +$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=21" \ No newline at end of file From f5f60eb9f363df3382ef6d5fc260f0d9ee1c8131 Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Mon, 9 May 2022 14:01:23 +0800 Subject: [PATCH 03/37] fix typo --- tests/queries/0_stateless/02293_test_zstd_window_log_max.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh index 764b0e8b2d3..39d1b443739 100755 --- a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh @@ -5,7 +5,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -# reuse the test data in 02293_test_zstd_window_log_max.sh +# reuse the test data in 01946_test_zstd_decompression_with_escape_sequence_at_the_end_of_buffer.sh $CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=20" 2>&1 | grep -c \ "Code: 561. DB::Exception: Zstd stream encoding failed: error 'Frame requires too much memory for decoding'; zstd version: 1.5.0: While executing File. (ZSTD_DECODER_FAILED)" $CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=21" \ No newline at end of file From 85356bbf6468e7c22265c6505d1b85f3d67b1d18 Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Wed, 11 May 2022 00:53:09 +0800 Subject: [PATCH 04/37] fix --- src/Core/Settings.h | 2 +- src/IO/CompressionMethod.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 895e10659e5..27450e634e3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -176,7 +176,7 @@ class IColumn; \ M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \ \ - M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD", 0) \ + M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD (it will not be used for MergeTree family)", 0) \ \ M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \ M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \ diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index e87c7e5e7a1..e99e9695837 100644 --- a/src/IO/CompressionMethod.h +++ b/src/IO/CompressionMethod.h @@ -4,12 +4,12 @@ #include #include -#include namespace DB { class ReadBuffer; class WriteBuffer; +struct Settings; /** These are "generally recognizable" compression methods for data import/export. 
* Do not mess with more efficient compression methods used by ClickHouse internally From ab5636a46a3e40715ca52248565409f516bea44c Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Fri, 13 May 2022 17:52:39 +0800 Subject: [PATCH 05/37] fix --- src/IO/CompressionMethod.cpp | 19 ++++--------------- src/IO/CompressionMethod.h | 9 +-------- src/Server/HTTPHandler.cpp | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 8 ++++---- src/Storages/StorageFile.cpp | 5 +++-- src/Storages/StorageS3.cpp | 8 ++++---- src/Storages/StorageURL.cpp | 4 ++-- 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index dc955322e16..0da235c074c 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -99,7 +99,7 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s } static std::unique_ptr createCompressedWrapper( - std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) { if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib) return std::make_unique(std::move(nested), method, buf_size, existing_memory, alignment); @@ -110,7 +110,7 @@ static std::unique_ptr createCompressedWrapper( if (method == CompressionMethod::Xz) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); if (method == CompressionMethod::Zstd) - return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); + return std::make_unique(std::move(nested), buf_size, existing_memory, alignment, zstd_window_log_max); if (method == CompressionMethod::Lz4) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); #if USE_BZIP2 @@ -126,24 +126,13 @@ static std::unique_ptr createCompressedWrapper( } std::unique_ptr wrapReadBufferWithCompressionMethod( - std::unique_ptr nested, CompressionMethod method, const Settings &settings, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, int zstd_window_log_max, size_t buf_size, char * existing_memory, size_t alignment) { if (method == CompressionMethod::None) return nested; - else if (method == CompressionMethod::Zstd) - return std::make_unique(std::move(nested), buf_size, existing_memory, alignment, settings.zstd_window_log_max); - return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment); + return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment, zstd_window_log_max); } -std::unique_ptr wrapReadBufferWithCompressionMethod( - std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) -{ - if (method == CompressionMethod::None) - return nested; - return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment); -} - - std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment) { diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index e99e9695837..a399a756c13 100644 --- a/src/IO/CompressionMethod.h +++ b/src/IO/CompressionMethod.h @@ -9,7 +9,6 @@ namespace DB { class ReadBuffer; class WriteBuffer; -struct Settings; /** These are "generally recognizable" compression methods for data 
import/export. * Do not mess with more efficient compression methods used by ClickHouse internally @@ -50,17 +49,11 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s std::unique_ptr wrapReadBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, - const Settings &settings, + int zstd_window_log_max = 0, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); -std::unique_ptr wrapReadBufferWithCompressionMethod( - std::unique_ptr nested, - CompressionMethod method, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - char * existing_memory = nullptr, - size_t alignment = 0); std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index cac62a18ce4..4193491ec9e 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -642,7 +642,7 @@ void HTTPHandler::processQuery( /// Request body can be compressed using algorithm specified in the Content-Encoding header. String http_request_compression_method_str = request.get("Content-Encoding", ""); auto in_post = wrapReadBufferWithCompressionMethod( - wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str), context->getSettingsRef()); + wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str), context->getSettingsRef().zstd_window_log_max); /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter. diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 762cb513c14..d9de785e60a 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -194,9 +194,9 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (it == paths.end()) return nullptr; auto compression = chooseCompressionMethod(*it, compression_method); - const auto &settings = ctx->getSettingsRef(); + auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( - std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression, settings); + std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); }; return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths.size() > 1, ctx); } @@ -325,8 +325,8 @@ bool HDFSSource::initialize() const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - const auto &settings = getContext()->getSettingsRef(); - read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression, settings); + const auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 52ec0e4e48e..9ba3d809d47 100644 --- a/src/Storages/StorageFile.cpp +++ 
b/src/Storages/StorageFile.cpp @@ -206,8 +206,9 @@ std::unique_ptr createReadBuffer( auto & in = static_cast(*nested_buffer); in.setProgressCallback(context); } - const auto &settings = context->getSettingsRef(); - return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, settings); + + auto zstd_window_log_max = context->getSettingsRef().zstd_window_log_max; + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, zstd_window_log_max); } } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 72ca9d6401f..d307517afdf 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -302,8 +302,8 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - const auto &settings = getContext()->getSettingsRef(); - read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint), settings); + auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint), zstd_window_log_max); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -1018,12 +1018,12 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( read_keys_in_distributed_processing->push_back(key); first = false; - const auto &settings = ctx->getSettingsRef(); + const auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( std::make_unique( s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), chooseCompressionMethod(key, compression_method), - settings); + zstd_window_log_max); }; return readSchemaFromFormat(format, format_settings, read_buffer_iterator, is_key_with_globs, ctx); diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index a3cdb36a652..74d44912251 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -350,7 +350,7 @@ namespace threadPoolCallbackRunner(IOThreadPool::get()), download_threads), chooseCompressionMethod(request_uri.getPath(), compression_method), - settings); + settings.zstd_window_log_max); } } catch (const Poco::Exception & e) @@ -382,7 +382,7 @@ namespace /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error), chooseCompressionMethod(request_uri.getPath(), compression_method), - settings); + settings.zstd_window_log_max); } catch (...) 
{ From 4cd7e65d972319af3b3b0eeb5daf6fedb2254197 Mon Sep 17 00:00:00 2001 From: wuxiaobai24 Date: Fri, 13 May 2022 18:24:50 +0800 Subject: [PATCH 06/37] fix style check --- src/Storages/StorageFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 9ba3d809d47..504d71bb6c0 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -206,7 +206,7 @@ std::unique_ptr createReadBuffer( auto & in = static_cast(*nested_buffer); in.setProgressCallback(context); } - + auto zstd_window_log_max = context->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, zstd_window_log_max); } From 17fb5d80687d644734e61aa880daf5db1a6d58a6 Mon Sep 17 00:00:00 2001 From: Suzy Wang Date: Wed, 1 Jun 2022 21:30:02 -0700 Subject: [PATCH 07/37] upgrade curl to 7.83.1 --- contrib/curl | 2 +- contrib/curl-cmake/CMakeLists.txt | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/contrib/curl b/contrib/curl index 801bd5138ce..462196e6b4a 160000 --- a/contrib/curl +++ b/contrib/curl @@ -1 +1 @@ -Subproject commit 801bd5138ce31aa0d906fa4e2eabfc599d74e793 +Subproject commit 462196e6b4a47f924293a0e26b8e9c23d37ac26f diff --git a/contrib/curl-cmake/CMakeLists.txt b/contrib/curl-cmake/CMakeLists.txt index b1e1a0ded8a..761ee036e66 100644 --- a/contrib/curl-cmake/CMakeLists.txt +++ b/contrib/curl-cmake/CMakeLists.txt @@ -84,7 +84,6 @@ set (SRCS "${LIBRARY_DIR}/lib/gopher.c" "${LIBRARY_DIR}/lib/idn_win32.c" "${LIBRARY_DIR}/lib/http_proxy.c" - "${LIBRARY_DIR}/lib/non-ascii.c" "${LIBRARY_DIR}/lib/asyn-thread.c" "${LIBRARY_DIR}/lib/curl_gssapi.c" "${LIBRARY_DIR}/lib/http_ntlm.c" @@ -93,10 +92,8 @@ set (SRCS "${LIBRARY_DIR}/lib/curl_sasl.c" "${LIBRARY_DIR}/lib/rand.c" "${LIBRARY_DIR}/lib/curl_multibyte.c" - "${LIBRARY_DIR}/lib/hostcheck.c" "${LIBRARY_DIR}/lib/conncache.c" "${LIBRARY_DIR}/lib/dotdot.c" - "${LIBRARY_DIR}/lib/x509asn1.c" "${LIBRARY_DIR}/lib/http2.c" "${LIBRARY_DIR}/lib/smb.c" "${LIBRARY_DIR}/lib/curl_endian.c" @@ -120,6 +117,9 @@ set (SRCS "${LIBRARY_DIR}/lib/http_aws_sigv4.c" "${LIBRARY_DIR}/lib/mqtt.c" "${LIBRARY_DIR}/lib/rename.c" + "${LIBRARY_DIR}/lib/h2h3.c" + "${LIBRARY_DIR}/lib/headers.c" + "${LIBRARY_DIR}/lib/timediff.c" "${LIBRARY_DIR}/lib/vauth/vauth.c" "${LIBRARY_DIR}/lib/vauth/cleartext.c" "${LIBRARY_DIR}/lib/vauth/cram.c" @@ -142,11 +142,13 @@ set (SRCS "${LIBRARY_DIR}/lib/vtls/sectransp.c" "${LIBRARY_DIR}/lib/vtls/gskit.c" "${LIBRARY_DIR}/lib/vtls/mbedtls.c" - "${LIBRARY_DIR}/lib/vtls/mesalink.c" "${LIBRARY_DIR}/lib/vtls/bearssl.c" "${LIBRARY_DIR}/lib/vtls/keylog.c" + "${LIBRARY_DIR}/lib/vtls/x509asn1.c" + "${LIBRARY_DIR}/lib/vtls/hostcheck.c" "${LIBRARY_DIR}/lib/vquic/ngtcp2.c" "${LIBRARY_DIR}/lib/vquic/quiche.c" + "${LIBRARY_DIR}/lib/vquic/msh3.c" "${LIBRARY_DIR}/lib/vssh/libssh2.c" "${LIBRARY_DIR}/lib/vssh/libssh.c" ) From 0588142b7ecdd21bbafb58c34527d65709266212 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 2 Jun 2022 11:56:06 +0200 Subject: [PATCH 08/37] Clarify that match() & like() assume UTF-8 The previous explanation sentence "The regular expression works with the string as if it is a set of bytes." suggested otherwise and since we don't have separate methods matchUTF8() and likeUTF8(), it makes sense to clarify. 
--- docs/en/sql-reference/functions/string-search-functions.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 985d9f1e63a..8d35204d783 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -350,11 +350,12 @@ In all `multiSearch*` functions the number of needles should be less than 2 ## match(haystack, pattern) {#matchhaystack-pattern} -Checks whether the string matches the `pattern` regular expression. A `re2` regular expression. The [syntax](https://github.com/google/re2/wiki/Syntax) of the `re2` regular expressions is more limited than the syntax of the Perl regular expressions. +Checks whether the string matches the regular expression `pattern` in `re2` syntax. `Re2` has a more limited [syntax](https://github.com/google/re2/wiki/Syntax) than Perl regular expressions. Returns 0 if it does not match, or 1 if it matches. -The regular expression works with the string as if it is a set of bytes. The regular expression can’t contain null bytes. +Matching is based on UTF-8, e.g. `.` matches the two-codepoint symbol `¥`. The regular expression must not contain null bytes. + For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. ## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} @@ -498,6 +499,8 @@ The regular expression can contain the metasymbols `%` and `_`. Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the ‘match’ function. +Matching is based on UTF-8, e.g. `_` matches the two-codepoint symbol `¥`. + For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. For other regular expressions, the code is the same as for the ‘match’ function. 
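To make the clarified semantics concrete, a small sketch (the literal strings are chosen for illustration; the behaviour stated in the comments is the one described by the documentation change above):

SELECT match('x¥y', '^x.y$');  -- 1: '.' consumes the whole character ¥ (two bytes in UTF-8), not a single byte
SELECT 'x¥y' LIKE 'x_y';       -- 1: '_' likewise matches one UTF-8 character
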
From e55a441604b7146cf19e47eca39f3510658b8a13 Mon Sep 17 00:00:00 2001 From: Suzy Wang Date: Fri, 3 Jun 2022 08:10:55 -0700 Subject: [PATCH 09/37] trigger rebuild From b7b9936cc4635cc993c595242897da7c598fa88d Mon Sep 17 00:00:00 2001 From: Suzy Wang Date: Mon, 6 Jun 2022 07:37:32 -0700 Subject: [PATCH 10/37] Retrigger CI From fddd031385516c0e5150623ae4ea2e385017dd71 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 4 May 2022 23:36:26 +0300 Subject: [PATCH 11/37] Randomize settings related to in-order read/aggregation Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 355f7b7a712..648072eea24 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -420,6 +420,10 @@ class SettingsRandomizer: "max_block_size": lambda: random.randint(8000, 100000), "max_threads": lambda: random.randint(1, 64), "optimize_or_like_chain": lambda: random.randint(0, 1), + "optimize_read_in_order": lambda: random.randint(0, 1), + "read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100), + "optimize_aggregation_in_order": lambda: random.randint(0, 1), + "aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000), } @staticmethod From 23d4837e0a62dad85c569152b0c8f8cf454dd152 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 5 May 2022 08:03:56 +0300 Subject: [PATCH 12/37] tests: add explicit settings/ORDER BY for in order reading/aggregation v2: rebase on top of upstream/master v3: fix 01780_column_sparse_full v4: fix 01410_nullable_key_and_index Signed-off-by: Azat Khuzhin --- .../0_stateless/01045_order_by_pk_special_storages.sh | 6 +++--- tests/queries/0_stateless/01232_untuple.reference | 8 ++++---- tests/queries/0_stateless/01232_untuple.sql | 2 +- tests/queries/0_stateless/01268_mergine_sorted_limit.sql | 4 ++-- .../0_stateless/01323_add_scalars_in_time.reference | 2 +- tests/queries/0_stateless/01323_add_scalars_in_time.sql | 3 ++- .../0_stateless/01410_nullable_key_and_index.reference | 4 ++-- .../queries/0_stateless/01410_nullable_key_and_index.sql | 3 ++- .../01513_optimize_aggregation_in_order_memory_long.sql | 2 +- .../0_stateless/01521_alter_enum_and_reverse_read.sql | 2 +- .../01562_optimize_monotonous_functions_in_order_by.sql | 1 + tests/queries/0_stateless/01786_explain_merge_tree.sh | 2 +- .../0_stateless/02149_read_in_order_fixed_prefix.sql | 2 ++ .../0_stateless/02155_read_in_order_max_rows_to_read.sql | 1 + .../02233_optimize_aggregation_in_order_prefix.reference | 4 ++-- .../02233_optimize_aggregation_in_order_prefix.sql | 4 ++-- 16 files changed, 28 insertions(+), 22 deletions(-) diff --git a/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh b/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh index a46fedb4533..bb76f3978cc 100755 --- a/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh +++ b/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh @@ -24,7 +24,7 @@ $CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 5" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM m ORDER BY a, s LIMIT 10" # Not a single .sql test with max_rows_to_read because it doesn't work with Merge storage -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep 
"rows_read" | sed 's/[^0-9]*//g') # Expected number of read rows with a bit margin if [[ $rows_read -lt 500 ]] @@ -36,7 +36,7 @@ fi $CLICKHOUSE_CLIENT -q "SELECT '---StorageBuffer---'" $CLICKHOUSE_CLIENT -q "CREATE TABLE buf (a UInt32, s String) engine = Buffer('$CLICKHOUSE_DATABASE', s2, 16, 10, 100, 10000, 1000000, 10000000, 100000000)" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM buf ORDER BY a, s LIMIT 10" -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM buf ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM buf ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep "rows_read" | sed 's/[^0-9]*//g') # Expected number of read rows with a bit margin if [[ $rows_read -lt 500 ]] @@ -48,7 +48,7 @@ fi $CLICKHOUSE_CLIENT -q "SELECT '---MaterializedView---'" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW mv (a UInt32, s String) engine = MergeTree ORDER BY s SETTINGS min_bytes_for_wide_part = 0 POPULATE AS SELECT a, s FROM s1 WHERE a % 7 = 0" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10" -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep "rows_read" | sed 's/[^0-9]*//g') if [[ $rows_read -lt 500 ]] then echo "OK" diff --git a/tests/queries/0_stateless/01232_untuple.reference b/tests/queries/0_stateless/01232_untuple.reference index 21fd0c4a8a5..8e1f97d2585 100644 --- a/tests/queries/0_stateless/01232_untuple.reference +++ b/tests/queries/0_stateless/01232_untuple.reference @@ -3,11 +3,11 @@ hello 1 3 world 9 9 (0,1) key tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 1) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 2) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 3) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 4) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 5) -4 10 20 10 20 30 -3 70 20 10 20 30 -2 11 20 10 20 30 -5 10 20 10 20 30 1 20 20 10 20 30 +2 11 20 10 20 30 +3 70 20 10 20 30 +4 10 20 10 20 30 +5 10 20 10 20 30 6 10 20 10 20 30 7 18 20 10 20 30 8 30 20 10 20 30 diff --git a/tests/queries/0_stateless/01232_untuple.sql b/tests/queries/0_stateless/01232_untuple.sql index 39ee9e82fa7..92150e92b29 100644 --- a/tests/queries/0_stateless/01232_untuple.sql +++ b/tests/queries/0_stateless/01232_untuple.sql @@ -6,5 +6,5 @@ select argMax(untuple(x)), min(x) from (select (number, number + 1) as x from nu drop table if exists kv; create table kv (key int, v1 int, v2 int, v3 int, v4 int, v5 int) engine MergeTree order by key; insert into kv values (1, 10, 20, 10, 20, 30), (2, 11, 20, 10, 20, 30), (1, 18, 20, 10, 20, 30), (1, 20, 20, 10, 20, 30), (3, 70, 20, 10, 20, 30), (4, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (5, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (8, 30, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (6, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (7, 18, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (7, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (8, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30); -select key, untuple(argMax((* except (key),), v1)) from kv group by key format TSVWithNames; +select key, untuple(argMax((* except (key),), v1)) from kv group by key order by key format TSVWithNames; drop table if exists kv; 
diff --git a/tests/queries/0_stateless/01268_mergine_sorted_limit.sql b/tests/queries/0_stateless/01268_mergine_sorted_limit.sql index fbe047a3a77..49d8161bf83 100644 --- a/tests/queries/0_stateless/01268_mergine_sorted_limit.sql +++ b/tests/queries/0_stateless/01268_mergine_sorted_limit.sql @@ -6,7 +6,7 @@ INSERT INTO tab VALUES (1,1),(1,2),(1,3),(1,4),(1,5); INSERT INTO tab VALUES (2,6),(2,7),(2,8),(2,9),(2,0); -SELECT * FROM tab ORDER BY x LIMIT 3; -SELECT * FROM tab ORDER BY x LIMIT 4; +SELECT * FROM tab ORDER BY x LIMIT 3 SETTINGS optimize_read_in_order=1; +SELECT * FROM tab ORDER BY x LIMIT 4 SETTINGS optimize_read_in_order=1; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/01323_add_scalars_in_time.reference b/tests/queries/0_stateless/01323_add_scalars_in_time.reference index 408efa7f823..bffe4d46ab2 100644 --- a/tests/queries/0_stateless/01323_add_scalars_in_time.reference +++ b/tests/queries/0_stateless/01323_add_scalars_in_time.reference @@ -1,5 +1,5 @@ -[0,2,3] id2 [1,2,3] id1 +[0,2,3] id2 test [1,2,3,4] 2 fre 3 jhg diff --git a/tests/queries/0_stateless/01323_add_scalars_in_time.sql b/tests/queries/0_stateless/01323_add_scalars_in_time.sql index 2ee5603f760..c337cd86f5b 100644 --- a/tests/queries/0_stateless/01323_add_scalars_in_time.sql +++ b/tests/queries/0_stateless/01323_add_scalars_in_time.sql @@ -16,7 +16,8 @@ WITH SELECT arraySort(arrayIntersect(argMax(seqs, create_time), arr1)) AS common, id FROM tags WHERE id LIKE 'id%' -GROUP BY id; +GROUP BY id +ORDER BY id; DROP TABLE tags; diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.reference b/tests/queries/0_stateless/01410_nullable_key_and_index.reference index da88fbddd7a..37456e6c8d6 100644 --- a/tests/queries/0_stateless/01410_nullable_key_and_index.reference +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.reference @@ -8,9 +8,9 @@ 14 21 16 24 18 27 -\N 0 -\N -1 \N -2 +\N -1 +\N 0 \N 0 \N -1 \N -2 diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.sql b/tests/queries/0_stateless/01410_nullable_key_and_index.sql index 969432eba01..905d997d95c 100644 --- a/tests/queries/0_stateless/01410_nullable_key_and_index.sql +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.sql @@ -3,13 +3,14 @@ DROP TABLE IF EXISTS nullable_key_without_final_mark; DROP TABLE IF EXISTS nullable_minmax_index; SET max_threads = 1; +SET optimize_read_in_order=0; CREATE TABLE nullable_key (k Nullable(int), v int) ENGINE MergeTree ORDER BY k SETTINGS allow_nullable_key = 1, index_granularity = 1; INSERT INTO nullable_key SELECT number * 2, number * 3 FROM numbers(10); INSERT INTO nullable_key SELECT NULL, -number FROM numbers(3); -SELECT * FROM nullable_key ORDER BY k; +SELECT * FROM nullable_key ORDER BY k, v; SET force_primary_key = 1; SET max_rows_to_read = 3; diff --git a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql index cca994e8e4a..784dd73b865 100644 --- a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql +++ b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql @@ -12,7 +12,7 @@ set max_memory_usage='500M'; set max_threads=1; set max_block_size=500; -select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null; -- { serverError 241; } +select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null settings optimize_aggregation_in_order=0; -- { 
serverError 241; } select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null settings optimize_aggregation_in_order=1; -- for WITH TOTALS previous groups should be kept. select key, groupArray(repeat('a', 200)), count() from data_01513 group by key with totals format Null settings optimize_aggregation_in_order=1; -- { serverError 241; } diff --git a/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql b/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql index 014790a61c1..b5391517c14 100644 --- a/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql +++ b/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql @@ -8,6 +8,6 @@ ALTER TABLE enum_test MODIFY COLUMN e Enum8('IU' = 1, 'WS' = 2, 'PS' = 3); INSERT INTO enum_test SELECT '2020-10-09 00:00:00', 'h1', 'PS' from numbers(1); -SELECT * FROM enum_test ORDER BY timestamp, e desc; +SELECT * FROM enum_test ORDER BY timestamp, e desc SETTINGS optimize_read_in_order=1; DROP TABLE IF EXISTS enum_test; diff --git a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql index b31457d8f68..15ddb5a848f 100644 --- a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql +++ b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql @@ -1,4 +1,5 @@ SET optimize_monotonous_functions_in_order_by = 1; +SET optimize_read_in_order = 1; DROP TABLE IF EXISTS test_order_by; diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.sh b/tests/queries/0_stateless/01786_explain_merge_tree.sh index eb47f065044..138905c65e7 100755 --- a/tests/queries/0_stateless/01786_explain_merge_tree.sh +++ b/tests/queries/0_stateless/01786_explain_merge_tree.sh @@ -4,7 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --optimize_move_to_prewhere=1 --convert_query_to_cnf=0" +CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --optimize_move_to_prewhere=1 --convert_query_to_cnf=0 --optimize_read_in_order=1" $CLICKHOUSE_CLIENT -q "drop table if exists test_index" $CLICKHOUSE_CLIENT -q "drop table if exists idx" diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql index 8fb11ac383c..4dfcbb9bf80 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql @@ -1,4 +1,6 @@ SET max_threads=0; +SET optimize_read_in_order=1; +SET read_in_order_two_level_merge_threshold=100; DROP TABLE IF EXISTS t_read_in_order; diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql index e82c78b5e42..9846c1208a1 100644 --- a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -7,6 +7,7 @@ SETTINGS index_granularity = 4; INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); SET max_threads = 1; +SET optimize_read_in_order = 1; SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference index 9d252c9f396..e08e89fdff9 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference +++ b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference @@ -20,7 +20,7 @@ ExpressionTransform × 2 ExpressionTransform (ReadFromMergeTree) MergeTreeInOrder 0 → 1 -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; (Expression) ExpressionTransform × 2 (Sorting) @@ -103,7 +103,7 @@ select parent_key, child_key, count() from data_02233 group by parent_key, child 9 2 3 0 0 100 -select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; 0 0 4 0 1 3 0 2 3 diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql index cf1e825b03d..5f64c929a5f 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql +++ b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql @@ -8,10 +8,10 @@ SELECT child_key, parent_key, child_key FROM data_02233 GROUP BY parent_key, chi -- { echoOn } insert into data_02233 select number%10, number%3, number from numbers(100); explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; -explain pipeline select 
parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, max_block_size=1; -select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; -- fuzzer SELECT child_key, parent_key, child_key FROM data_02233 GROUP BY parent_key, child_key, child_key ORDER BY child_key, parent_key ASC NULLS LAST SETTINGS max_threads = 1, optimize_aggregation_in_order = 1; From 6cdb0bcb6e73234087b11c8ec8e69732ca5d888d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 1 Jun 2022 07:18:20 +0300 Subject: [PATCH 13/37] tests: fix 02233_optimize_aggregation_in_order_prefix --- .../02233_optimize_aggregation_in_order_prefix.reference | 4 ++-- .../02233_optimize_aggregation_in_order_prefix.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference index e08e89fdff9..f98effbec67 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference +++ b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference @@ -2,7 +2,7 @@ 0 0 0 -- { echoOn } insert into data_02233 select number%10, number%3, number from numbers(100); -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, read_in_order_two_level_merge_threshold=1; (Expression) ExpressionTransform × 2 (Sorting) @@ -20,7 +20,7 @@ ExpressionTransform × 2 ExpressionTransform (ReadFromMergeTree) MergeTreeInOrder 0 → 1 -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0, read_in_order_two_level_merge_threshold=1; (Expression) ExpressionTransform × 2 (Sorting) diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql index 5f64c929a5f..233599feb65 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql +++ 
b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql @@ -7,8 +7,8 @@ SELECT child_key, parent_key, child_key FROM data_02233 GROUP BY parent_key, chi -- { echoOn } insert into data_02233 select number%10, number%3, number from numbers(100); -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, read_in_order_two_level_merge_threshold=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0, read_in_order_two_level_merge_threshold=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, max_block_size=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; From 9cac78b49865fc83be8255abac3115a63cfd289a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 15 Jun 2022 19:54:46 +0200 Subject: [PATCH 14/37] make tests with tsan less flaky --- base/harmful/harmful.c | 31 ++++++++++++++++ contrib/librdkafka | 2 +- docker/test/stress/run.sh | 35 ++++++++++--------- .../00984_parser_stack_overflow.sh | 2 ++ .../01172_transaction_counters.sql | 3 +- .../01183_custom_separated_format_http.sh | 2 ++ .../01184_long_insert_values_huge_strings.sh | 3 +- .../0_stateless/01651_lc_insert_tiny_log.sql | 3 ++ ..._long_zstd_http_compression_json_format.sh | 3 +- .../0_stateless/01926_order_by_desc_limit.sql | 3 +- .../00159_parallel_formatting_http.sh | 2 ++ 11 files changed, 67 insertions(+), 22 deletions(-) diff --git a/base/harmful/harmful.c b/base/harmful/harmful.c index 5a27cae0383..6112f9a339c 100644 --- a/base/harmful/harmful.c +++ b/base/harmful/harmful.c @@ -260,4 +260,35 @@ TRAP(mq_timedreceive) TRAP(wordexp) TRAP(wordfree) +/// C11 threading primitives are not supported by ThreadSanitizer. +/// Also we should avoid using them for compatibility with old libc. 
+TRAP(thrd_create) +TRAP(thrd_equal) +TRAP(thrd_current) +TRAP(thrd_sleep) +TRAP(thrd_yield) +TRAP(thrd_exit) +TRAP(thrd_detach) +TRAP(thrd_join) + +TRAP(mtx_init) +TRAP(mtx_lock) +TRAP(mtx_timedlock) +TRAP(mtx_trylock) +TRAP(mtx_unlock) +TRAP(mtx_destroy) +TRAP(call_once) + +TRAP(cnd_init) +TRAP(cnd_signal) +TRAP(cnd_broadcast) +TRAP(cnd_wait) +TRAP(cnd_timedwait) +TRAP(cnd_destroy) + +TRAP(tss_create) +TRAP(tss_get) +TRAP(tss_set) +TRAP(tss_delete) + #endif diff --git a/contrib/librdkafka b/contrib/librdkafka index b8554f16820..81b413cc1c2 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit b8554f1682062c85ba519eb54ef2f90e02b812cb +Subproject commit 81b413cc1c2a33ad4e96df856b89184efbd6221c diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index fa9d427aa33..9f8ab3eec9a 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -7,26 +7,27 @@ set -x # Thread Fuzzer allows to check more permutations of possible thread scheduling # and find more potential issues. +is_tsan_build=$(clickhouse local -q "select value like '% -fsanitize=thread %' from system.build_options where name='CXX_FLAGS'") +if [ "$is_tsan_build" -eq "0" ]; then + export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 + export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 + export THREAD_FUZZER_SLEEP_TIME_US=100000 -export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 -export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 -export THREAD_FUZZER_SLEEP_TIME_US=100000 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 - -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 - -export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 +fi function install_packages() { diff --git a/tests/queries/0_stateless/00984_parser_stack_overflow.sh b/tests/queries/0_stateless/00984_parser_stack_overflow.sh index 329e51e774a..7c4a6336a51 100755 --- a/tests/queries/0_stateless/00984_parser_stack_overflow.sh +++ b/tests/queries/0_stateless/00984_parser_stack_overflow.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash 
+# Tags: no-tsan +# FIXME should work with tsan # Such a huge timeout mostly for debug build. CLICKHOUSE_CURL_TIMEOUT=60 diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql index b84a7b25c47..83bad35c40b 100644 --- a/tests/queries/0_stateless/01172_transaction_counters.sql +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -1,5 +1,6 @@ --- Tags: no-s3-storage +-- Tags: no-s3-storage, no-tsan -- FIXME this test fails with S3 due to a bug in DiskCacheWrapper +-- FIXME should work with tsan drop table if exists txn_counters; create table txn_counters (n Int64, creation_tid DEFAULT transactionID()) engine=MergeTree order by n; diff --git a/tests/queries/0_stateless/01183_custom_separated_format_http.sh b/tests/queries/0_stateless/01183_custom_separated_format_http.sh index 8eaa22f4ecc..0aa750a171f 100755 --- a/tests/queries/0_stateless/01183_custom_separated_format_http.sh +++ b/tests/queries/0_stateless/01183_custom_separated_format_http.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-tsan +# FIXME should work with tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh index 09a43d13a42..9c3a2d295f7 100755 --- a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh +++ b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-tsan +# FIXME should work with tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql index 22532529812..15baa88ec4e 100644 --- a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql +++ b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql @@ -1,3 +1,6 @@ +-- Tags: no-tsan +-- FIXME should work with tsan + drop table if exists perf_lc_num; CREATE TABLE perf_lc_num(  num UInt8,  arr Array(LowCardinality(Int64)) default [num]  ) ENGINE = TinyLog; diff --git a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh index e10032e04fd..e198adc2dc6 100755 --- a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh +++ b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -# Tags: long, no-fasttest +# Tags: long, no-fasttest, no-tsan +# FIXME should work with tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01926_order_by_desc_limit.sql b/tests/queries/0_stateless/01926_order_by_desc_limit.sql index 86468b4fcd6..785a2e10ee3 100644 --- a/tests/queries/0_stateless/01926_order_by_desc_limit.sql +++ b/tests/queries/0_stateless/01926_order_by_desc_limit.sql @@ -1,4 +1,5 @@ --- Tags: no-random-settings +-- Tags: no-random-settings, no-tsan +-- FIXME should work with tsan DROP TABLE IF EXISTS order_by_desc; diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index ea4a4d12867..b7382c5f491 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ 
b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-tsan +# FIXME should work with tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From ca649da97f1b9c71e1828d8d9c3329e1da2f255b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 16 Jun 2022 12:12:01 +0200 Subject: [PATCH 15/37] better comments --- tests/queries/0_stateless/00984_parser_stack_overflow.sh | 2 +- tests/queries/0_stateless/01172_transaction_counters.sql | 2 +- tests/queries/0_stateless/01183_custom_separated_format_http.sh | 2 +- .../0_stateless/01184_long_insert_values_huge_strings.sh | 2 +- tests/queries/0_stateless/01651_lc_insert_tiny_log.sql | 2 +- .../0_stateless/01746_long_zstd_http_compression_json_format.sh | 2 +- tests/queries/0_stateless/01926_order_by_desc_limit.sql | 2 +- tests/queries/1_stateful/00159_parallel_formatting_http.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/00984_parser_stack_overflow.sh b/tests/queries/0_stateless/00984_parser_stack_overflow.sh index 7c4a6336a51..168ef155d9b 100755 --- a/tests/queries/0_stateless/00984_parser_stack_overflow.sh +++ b/tests/queries/0_stateless/00984_parser_stack_overflow.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Tags: no-tsan -# FIXME should work with tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan # Such a huge timeout mostly for debug build. CLICKHOUSE_CURL_TIMEOUT=60 diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql index 83bad35c40b..8e04b6c89bd 100644 --- a/tests/queries/0_stateless/01172_transaction_counters.sql +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -1,6 +1,6 @@ -- Tags: no-s3-storage, no-tsan -- FIXME this test fails with S3 due to a bug in DiskCacheWrapper --- FIXME should work with tsan +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan drop table if exists txn_counters; create table txn_counters (n Int64, creation_tid DEFAULT transactionID()) engine=MergeTree order by n; diff --git a/tests/queries/0_stateless/01183_custom_separated_format_http.sh b/tests/queries/0_stateless/01183_custom_separated_format_http.sh index 0aa750a171f..744cf0c08bd 100755 --- a/tests/queries/0_stateless/01183_custom_separated_format_http.sh +++ b/tests/queries/0_stateless/01183_custom_separated_format_http.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Tags: no-tsan -# FIXME should work with tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh index 9c3a2d295f7..f4bad961f21 100755 --- a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh +++ b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Tags: long, no-tsan -# FIXME should work with tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql index 15baa88ec4e..ec2a1850594 100644 --- 
a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql +++ b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql @@ -1,5 +1,5 @@ -- Tags: no-tsan --- FIXME should work with tsan +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan drop table if exists perf_lc_num; diff --git a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh index e198adc2dc6..16f5211f012 100755 --- a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh +++ b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Tags: long, no-fasttest, no-tsan -# FIXME should work with tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01926_order_by_desc_limit.sql b/tests/queries/0_stateless/01926_order_by_desc_limit.sql index 785a2e10ee3..223dbf70fc4 100644 --- a/tests/queries/0_stateless/01926_order_by_desc_limit.sql +++ b/tests/queries/0_stateless/01926_order_by_desc_limit.sql @@ -1,5 +1,5 @@ -- Tags: no-random-settings, no-tsan --- FIXME should work with tsan +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan DROP TABLE IF EXISTS order_by_desc; diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index b7382c5f491..7b949cf23e6 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # Tags: no-tsan -# FIXME should work with tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 7ed305f9b17095d07980c2018aa8bc5560eb8b48 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 16 Jun 2022 13:20:03 +0200 Subject: [PATCH 16/37] Improvement of cherry-pick/backport script - cherry_pick.py now can ba launched locally, with dry-run - get rid of fallback import paths - do not create a huge pile of objects for every sneezing - the same for hidden imports in deep local functions - improve logging - fix imports for cherry_pick_utils entities - Significantly reduced requests to GraphQL API --- tests/ci/cherry_pick.py | 86 +++++++++++------ tests/ci/cherry_pick_utils/__init__.py | 1 + tests/ci/cherry_pick_utils/backport.py | 59 ++++++------ tests/ci/cherry_pick_utils/cherrypick.py | 23 ++--- tests/ci/cherry_pick_utils/local.py | 15 +-- tests/ci/cherry_pick_utils/query.py | 116 ++++++++++++----------- 6 files changed, 171 insertions(+), 129 deletions(-) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 4bbd30cd186..d9f94ffa6c7 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import sys +import argparse import logging import os import subprocess @@ -12,37 +12,61 @@ from cherry_pick_utils.backport import Backport from cherry_pick_utils.cherrypick import CherryPick +def parse_args(): + parser = argparse.ArgumentParser("Create cherry-pick and backport PRs") + parser.add_argument("--token", help="github token, if not set, used from smm") + parser.add_argument("--dry-run", action="store_true", help="do not create anything") + return parser.parse_args() + + +def main(): + args = parse_args() + token = args.token or get_parameter_from_ssm("github_robot_token_1") + + bp = Backport( + token, + os.environ.get("REPO_OWNER"), + os.environ.get("REPO_NAME"), + os.environ.get("REPO_TEAM"), + ) + + cherry_pick = CherryPick( + token, + os.environ.get("REPO_OWNER"), + os.environ.get("REPO_NAME"), + os.environ.get("REPO_TEAM"), + 1, + "master", + ) + # Use the same _gh in both objects to have a proper cost + # pylint: disable=protected-access + for key in bp._gh.api_costs: + if key in cherry_pick._gh.api_costs: + bp._gh.api_costs[key] += cherry_pick._gh.api_costs[key] + for key in cherry_pick._gh.api_costs: + if key not in bp._gh.api_costs: + bp._gh.api_costs[key] = cherry_pick._gh.api_costs[key] + cherry_pick._gh = bp._gh + # pylint: enable=protected-access + + def cherrypick_run(pr_data, branch): + cherry_pick.update_pr_branch(pr_data, branch) + return cherry_pick.execute(GITHUB_WORKSPACE, args.dry_run) + + try: + bp.execute(GITHUB_WORKSPACE, "origin", None, cherrypick_run) + except subprocess.CalledProcessError as e: + logging.error(e.output) + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - repo_path = GITHUB_WORKSPACE - temp_path = TEMP_PATH - if not os.path.exists(temp_path): - os.makedirs(temp_path) + if not os.path.exists(TEMP_PATH): + os.makedirs(TEMP_PATH) - sys.path.append(os.path.join(repo_path, "utils/github")) - - with SSHKey("ROBOT_CLICKHOUSE_SSH_KEY"): - token = get_parameter_from_ssm("github_robot_token_1") - - bp = Backport( - token, - os.environ.get("REPO_OWNER"), - os.environ.get("REPO_NAME"), - os.environ.get("REPO_TEAM"), - ) - - def cherrypick_run(token, pr, branch): - return CherryPick( - token, - os.environ.get("REPO_OWNER"), - os.environ.get("REPO_NAME"), - os.environ.get("REPO_TEAM"), - pr, - branch, - ).execute(repo_path, False) - - try: - bp.execute(repo_path, "origin", None, cherrypick_run) - except subprocess.CalledProcessError as e: - logging.error(e.output) + if os.getenv("ROBOT_CLICKHOUSE_SSH_KEY", ""): + with 
SSHKey("ROBOT_CLICKHOUSE_SSH_KEY"): + main() + else: + main() diff --git a/tests/ci/cherry_pick_utils/__init__.py b/tests/ci/cherry_pick_utils/__init__.py index 40a96afc6ff..faa18be5bbf 100644 --- a/tests/ci/cherry_pick_utils/__init__.py +++ b/tests/ci/cherry_pick_utils/__init__.py @@ -1 +1,2 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- diff --git a/tests/ci/cherry_pick_utils/backport.py b/tests/ci/cherry_pick_utils/backport.py index 615c0d19ffa..9a510caf92a 100644 --- a/tests/ci/cherry_pick_utils/backport.py +++ b/tests/ci/cherry_pick_utils/backport.py @@ -1,19 +1,17 @@ # -*- coding: utf-8 -*- -try: - from clickhouse.utils.github.cherrypick import CherryPick - from clickhouse.utils.github.query import Query as RemoteRepo - from clickhouse.utils.github.local import Repository as LocalRepo -except: - from .cherrypick import CherryPick - from .query import Query as RemoteRepo - from .local import Repository as LocalRepo - import argparse import logging +import os import re import sys +sys.path.append(os.path.dirname(__file__)) + +from cherrypick import CherryPick +from query import Query as RemoteRepo +from local import Repository as LocalRepo + class Backport: def __init__(self, token, owner, name, team): @@ -49,14 +47,16 @@ class Backport: logging.info("No release branches found!") return - for branch in branches: - logging.info("Found release branch: %s", branch[0]) + logging.info( + "Found release branches: %s", ", ".join([br[0] for br in branches]) + ) if not until_commit: until_commit = branches[0][1] pull_requests = self.getPullRequests(until_commit) backport_map = {} + pr_map = {pr["number"]: pr for pr in pull_requests} RE_MUST_BACKPORT = re.compile(r"^v(\d+\.\d+)-must-backport$") RE_NO_BACKPORT = re.compile(r"^v(\d+\.\d+)-no-backport$") @@ -68,17 +68,17 @@ class Backport: pr["mergeCommit"]["oid"] ): logging.info( - "PR #{} is already inside {}. Dropping this branch for further PRs".format( - pr["number"], branches[-1][0] - ) + "PR #%s is already inside %s. Dropping this branch for further PRs", + pr["number"], + branches[-1][0], ) branches.pop() - logging.info("Processing PR #{}".format(pr["number"])) + logging.info("Processing PR #%s", pr["number"]) - assert len(branches) + assert len(branches) != 0 - branch_set = set([branch[0] for branch in branches]) + branch_set = {branch[0] for branch in branches} # First pass. 
Find all must-backports for label in pr["labels"]["nodes"]: @@ -120,16 +120,16 @@ class Backport: ) for pr, branches in list(backport_map.items()): - logging.info("PR #%s needs to be backported to:", pr) + statuses = [] for branch in branches: - logging.info( - "\t%s, and the status is: %s", - branch, - run_cherrypick(self._token, pr, branch), - ) + branch_status = run_cherrypick(pr_map[pr], branch) + statuses.append(f"{branch}, and the status is: {branch_status}") + logging.info( + "PR #%s needs to be backported to:\n\t%s", pr, "\n\t".join(statuses) + ) # print API costs - logging.info("\nGitHub API total costs per query:") + logging.info("\nGitHub API total costs for backporting per query:") for name, value in list(self._gh.api_costs.items()): logging.info("%s : %s", name, value) @@ -178,8 +178,13 @@ if __name__ == "__main__": else: logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO) - cherrypick_run = lambda token, pr, branch: CherryPick( - token, "ClickHouse", "ClickHouse", "core", pr, branch - ).execute(args.repo, args.dry_run) + cherry_pick = CherryPick( + args.token, "ClickHouse", "ClickHouse", "core", 1, "master" + ) + + def cherrypick_run(pr_data, branch): + cherry_pick.update_pr_branch(pr_data, branch) + return cherry_pick.execute(args.repo, args.dry_run) + bp = Backport(args.token, "ClickHouse", "ClickHouse", "core") bp.execute(args.repo, args.upstream, args.til, cherrypick_run) diff --git a/tests/ci/cherry_pick_utils/cherrypick.py b/tests/ci/cherry_pick_utils/cherrypick.py index c6469fa62a9..a6e4b9eec5b 100644 --- a/tests/ci/cherry_pick_utils/cherrypick.py +++ b/tests/ci/cherry_pick_utils/cherrypick.py @@ -14,10 +14,6 @@ Second run checks PR from previous run to be merged or at least being mergeable. Third run creates PR from backport branch (with merged previous PR) to release branch. """ -try: - from clickhouse.utils.github.query import Query as RemoteRepo -except: - from .query import Query as RemoteRepo import argparse from enum import Enum @@ -26,6 +22,10 @@ import os import subprocess import sys +sys.path.append(os.path.dirname(__file__)) + +from query import Query as RemoteRepo + class CherryPick: class Status(Enum): @@ -45,20 +45,21 @@ class CherryPick: def __init__(self, token, owner, name, team, pr_number, target_branch): self._gh = RemoteRepo(token, owner=owner, name=name, team=team) self._pr = self._gh.get_pull_request(pr_number) + self.target_branch = target_branch self.ssh_url = self._gh.ssh_url # TODO: check if pull-request is merged. 
+ self.update_pr_branch(self._pr, self.target_branch) + def update_pr_branch(self, pr_data, target_branch): + """The method is here to avoid unnecessary creation of new objects""" + self._pr = pr_data + self.target_branch = target_branch self.merge_commit_oid = self._pr["mergeCommit"]["oid"] - self.target_branch = target_branch - self.backport_branch = "backport/{branch}/{pr}".format( - branch=target_branch, pr=pr_number - ) - self.cherrypick_branch = "cherrypick/{branch}/{oid}".format( - branch=target_branch, oid=self.merge_commit_oid - ) + self.backport_branch = f"backport/{target_branch}/{pr_data['number']}" + self.cherrypick_branch = f"cherrypick/{target_branch}/{self.merge_commit_oid}" def getCherryPickPullRequest(self): return self._gh.find_pull_request( diff --git a/tests/ci/cherry_pick_utils/local.py b/tests/ci/cherry_pick_utils/local.py index 571c9102ba0..71923b63c35 100644 --- a/tests/ci/cherry_pick_utils/local.py +++ b/tests/ci/cherry_pick_utils/local.py @@ -5,10 +5,11 @@ import logging import os import re +import git + class RepositoryBase: def __init__(self, repo_path): - import git self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) @@ -23,22 +24,22 @@ class RepositoryBase: self.comparator = functools.cmp_to_key(cmp) - def get_head_commit(self): - return self._repo.commit(self._default) - def iterate(self, begin, end): - rev_range = "{}...{}".format(begin, end) + rev_range = f"{begin}...{end}" for commit in self._repo.iter_commits(rev_range, first_parent=True): yield commit class Repository(RepositoryBase): def __init__(self, repo_path, remote_name, default_branch_name): - super(Repository, self).__init__(repo_path) + super().__init__(repo_path) self._remote = self._repo.remotes[remote_name] self._remote.fetch() self._default = self._remote.refs[default_branch_name] + def get_head_commit(self): + return self._repo.commit(self._default) + def get_release_branches(self): """ Returns sorted list of tuples: @@ -73,7 +74,7 @@ class Repository(RepositoryBase): class BareRepository(RepositoryBase): def __init__(self, repo_path, default_branch_name): - super(BareRepository, self).__init__(repo_path) + super().__init__(repo_path) self._default = self._repo.branches[default_branch_name] def get_release_branches(self): diff --git a/tests/ci/cherry_pick_utils/query.py b/tests/ci/cherry_pick_utils/query.py index 40eb5bf3604..577a7185142 100644 --- a/tests/ci/cherry_pick_utils/query.py +++ b/tests/ci/cherry_pick_utils/query.py @@ -1,7 +1,13 @@ # -*- coding: utf-8 -*- -import requests +import json +import inspect +import logging import time +from urllib3.util.retry import Retry # type: ignore + +import requests # type: ignore +from requests.adapters import HTTPAdapter # type: ignore class Query: @@ -56,6 +62,7 @@ class Query: self._owner = owner self._name = name self._team = team + self._session = None self._max_page_size = max_page_size self._min_page_size = min_page_size @@ -129,7 +136,11 @@ class Query: next='after: "{}"'.format(result["pageInfo"]["endCursor"]), ) - members += dict([(node["login"], node["id"]) for node in result["nodes"]]) + # Update members with new nodes compatible with py3.8-py3.10 + members = { + **members, + **{node["login"]: node["id"] for node in result["nodes"]}, + } return members @@ -415,32 +426,37 @@ class Query: query = _SET_LABEL.format(pr_id=pull_request["id"], label_id=labels[0]["id"]) self._run(query, is_mutation=True) + @property + def session(self): + if self._session is not None: + return self._session + retries = 5 + 
self._session = requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=1, + status_forcelist=(403, 500, 502, 504), + ) + adapter = HTTPAdapter(max_retries=retry) + self._session.mount("http://", adapter) + self._session.mount("https://", adapter) + return self._session + def _run(self, query, is_mutation=False): - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry + # Get caller and parameters from the stack to track the progress + frame = inspect.getouterframes(inspect.currentframe(), 2)[1] + caller = frame[3] + f_parameters = inspect.signature(getattr(self, caller)).parameters + parameters = ", ".join(str(frame[0].f_locals[p]) for p in f_parameters) + mutation = "" + if is_mutation: + mutation = ", is mutation" + print(f"---GraphQL request for {caller}({parameters}){mutation}---") # sleep a little, because we querying github too often - print("Request, is mutation", is_mutation) - time.sleep(0.5) - - def requests_retry_session( - retries=5, - backoff_factor=0.5, - status_forcelist=(403, 500, 502, 504), - session=None, - ): - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session + time.sleep(0.1) headers = {"Authorization": "bearer {}".format(self._token)} if is_mutation: @@ -464,34 +480,28 @@ class Query: query=query ) - while True: - request = requests_retry_session().post( - "https://api.github.com/graphql", json={"query": query}, headers=headers - ) - if request.status_code == 200: - result = request.json() - if "errors" in result: - raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) - ) - - if not is_mutation: - import inspect - - caller = inspect.getouterframes(inspect.currentframe(), 2)[1][3] - if caller not in list(self.api_costs.keys()): - self.api_costs[caller] = 0 - self.api_costs[caller] += result["data"]["rateLimit"]["cost"] - - return result["data"] - else: - import json - + response = self.session.post( + "https://api.github.com/graphql", json={"query": query}, headers=headers + ) + if response.status_code == 200: + result = response.json() + if "errors" in result: raise Exception( - "Query failed with code {code}:\n{json}".format( - code=request.status_code, - json=json.dumps(request.json(), indent=4), + "Errors occurred: {}\nOriginal query: {}".format( + result["errors"], query ) ) + + if not is_mutation: + if caller not in list(self.api_costs.keys()): + self.api_costs[caller] = 0 + self.api_costs[caller] += result["data"]["rateLimit"]["cost"] + + return result["data"] + else: + raise Exception( + "Query failed with code {code}:\n{json}".format( + code=response.status_code, + json=json.dumps(response.json(), indent=4), + ) + ) From d7505d4de7283f62c8edb183da53e15d2c7646e6 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 16 Jun 2022 14:33:39 +0200 Subject: [PATCH 17/37] Change alignment, fix some f-strings --- tests/ci/cherry_pick_utils/cherrypick.py | 25 +- tests/ci/cherry_pick_utils/query.py | 362 +++++++++++------------ 2 files changed, 187 insertions(+), 200 deletions(-) diff --git a/tests/ci/cherry_pick_utils/cherrypick.py b/tests/ci/cherry_pick_utils/cherrypick.py index a6e4b9eec5b..92c87800828 100644 --- a/tests/ci/cherry_pick_utils/cherrypick.py +++ b/tests/ci/cherry_pick_utils/cherrypick.py @@ -119,17 +119,16 @@ class CherryPick: ) # Create pull-request like a local cherry-pick + title = self._pr["title"].replace('"', r"\"") pr = self._gh.create_pull_request( source=self.cherrypick_branch, target=self.backport_branch, - title="Cherry pick #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ), - description="Original pull-request #{}\n\n{}".format( - self._pr["number"], DESCRIPTION + title=( + f'Cherry pick #{self._pr["number"]} ' + f"to {self.target_branch}: " + f"{title}" ), + description=f'Original pull-request #{self._pr["number"]}\n\n{DESCRIPTION}', ) # FIXME: use `team` to leave a single eligible assignee. @@ -166,11 +165,8 @@ class CherryPick: "user.name=robot-clickhouse", ] - pr_title = "Backport #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ) + title = (self._pr["title"].replace('"', r"\""),) + pr_title = f"Backport #{self._pr['number']} to {self.target_branch}: {title}" self._run(git_prefix + ["checkout", "-f", self.backport_branch]) self._run(git_prefix + ["pull", "--ff-only", "origin", self.backport_branch]) @@ -204,9 +200,8 @@ class CherryPick: source=self.backport_branch, target=self.target_branch, title=pr_title, - description="Original pull-request #{}\nCherry-pick pull-request #{}\n\n{}".format( - self._pr["number"], cherrypick_pr["number"], DESCRIPTION - ), + description=f"Original pull-request #{self._pr['number']}\n" + f"Cherry-pick pull-request #{cherrypick_pr['number']}\n\n{DESCRIPTION}", ) # FIXME: use `team` to leave a single eligible assignee. diff --git a/tests/ci/cherry_pick_utils/query.py b/tests/ci/cherry_pick_utils/query.py index 577a7185142..ef1ad5ad0fb 100644 --- a/tests/ci/cherry_pick_utils/query.py +++ b/tests/ci/cherry_pick_utils/query.py @@ -16,43 +16,43 @@ class Query: """ _PULL_REQUEST = """ - author {{ - ... on User {{ - id - login - }} - }} - - baseRepository {{ - nameWithOwner - }} - - mergeCommit {{ - oid - parents(first: {min_page_size}) {{ - totalCount - nodes {{ - oid - }} - }} - }} - - mergedBy {{ - ... on User {{ - id - login - }} - }} - - baseRefName - closed - headRefName +author {{ + ... on User {{ id - mergeable - merged - number - title - url + login + }} +}} + +baseRepository {{ + nameWithOwner +}} + +mergeCommit {{ + oid + parents(first: {min_page_size}) {{ + totalCount + nodes {{ + oid + }} + }} +}} + +mergedBy {{ + ... 
on User {{ + id + login + }} +}} + +baseRefName +closed +headRefName +id +mergeable +merged +number +title +url """ def __init__(self, token, owner, name, team, max_page_size=100, min_page_size=10): @@ -78,13 +78,13 @@ class Query: def get_repository(self): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - name - }} - id - sshUrl - }} +repository(owner: "{owner}" name: "{name}") {{ + defaultBranchRef {{ + name + }} + id + sshUrl +}} """ query = _QUERY.format(owner=self._owner, name=self._name) @@ -98,20 +98,20 @@ class Query: """ _QUERY = """ - organization(login: "{organization}") {{ - team(slug: "{team}") {{ - members(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - login - }} - }} - }} +organization(login: "{organization}") {{ + team(slug: "{team}") {{ + members(first: {max_page_size} {next}) {{ + pageInfo {{ + hasNextPage + endCursor }} + nodes {{ + id + login + }} + }} + }} +}} """ members = {} @@ -133,7 +133,7 @@ class Query: organization=self._owner, team=self._team, max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) # Update members with new nodes compatible with py3.8-py3.10 @@ -146,11 +146,11 @@ class Query: def get_pull_request(self, number): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequest(number: {number}) {{ - {pull_request_data} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequest(number: {number}) {{ + {pull_request_data} + }} +}} """ query = _QUERY.format( @@ -164,14 +164,16 @@ class Query: def find_pull_request(self, base, head): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} baseRefName: "{base}" headRefName: "{head}") {{ - nodes {{ - {pull_request_data} - }} - totalCount - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequests( + first: {min_page_size} baseRefName: "{base}" headRefName: "{head}" + ) {{ + nodes {{ + {pull_request_data} + }} + totalCount + }} +}} """ query = _QUERY.format( @@ -193,13 +195,13 @@ class Query: Get all pull-requests filtered by label name """ _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ - nodes {{ - {pull_request_data} - }} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ + nodes {{ + {pull_request_data} + }} + }} +}} """ query = _QUERY.format( @@ -217,35 +219,32 @@ class Query: """ _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - target {{ - ... on Commit {{ - history(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - oid - associatedPullRequests(first: {min_page_size}) {{ - totalCount - nodes {{ - ... on PullRequest {{ - {pull_request_data} +repository(owner: "{owner}" name: "{name}") {{ + defaultBranchRef {{ + target {{ + ... on Commit {{ + history(first: {max_page_size} {next}) {{ + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + oid + associatedPullRequests(first: {min_page_size}) {{ + totalCount + nodes {{ + ... 
on PullRequest {{ + {pull_request_data} - labels(first: {min_page_size}) {{ - totalCount - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - name - color - }} - }} - }} + labels(first: {min_page_size}) {{ + totalCount + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + name + color }} }} }} @@ -254,6 +253,9 @@ class Query: }} }} }} + }} + }} +}} """ pull_requests = [] @@ -278,7 +280,7 @@ class Query: max_page_size=self._max_page_size, min_page_size=self._min_page_size, pull_request_data=self._PULL_REQUEST, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) for commit in result["nodes"]: @@ -296,7 +298,7 @@ class Query: for pull_request in commit["associatedPullRequests"]["nodes"]: if ( pull_request["baseRepository"]["nameWithOwner"] - == "{}/{}".format(self._owner, self._name) + == f"{self._owner}/{self._name}" and pull_request["baseRefName"] == self.default_branch and pull_request["mergeCommit"]["oid"] == commit["oid"] ): @@ -308,19 +310,19 @@ class Query: self, source, target, title, description="", draft=False, can_modify=True ): _QUERY = """ - createPullRequest(input: {{ - baseRefName: "{target}", - headRefName: "{source}", - repositoryId: "{id}", - title: "{title}", - body: "{body}", - draft: {draft}, - maintainerCanModify: {modify} - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} +createPullRequest(input: {{ + baseRefName: "{target}", + headRefName: "{source}", + repositoryId: "{id}", + title: "{title}", + body: "{body}", + draft: {draft}, + maintainerCanModify: {modify} +}}) {{ + pullRequest {{ + {pull_request_data} + }} +}} """ query = _QUERY.format( @@ -335,29 +337,29 @@ class Query: ) return self._run(query, is_mutation=True)["createPullRequest"]["pullRequest"] - def merge_pull_request(self, id): + def merge_pull_request(self, pr_id): _QUERY = """ - mergePullRequest(input: {{ - pullRequestId: "{id}" - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} +mergePullRequest(input: {{ + pullRequestId: "{pr_id}" +}}) {{ + pullRequest {{ + {pull_request_data} + }} +}} """ - query = _QUERY.format(id=id, pull_request_data=self._PULL_REQUEST) + query = _QUERY.format(pr_id=pr_id, pull_request_data=self._PULL_REQUEST) return self._run(query, is_mutation=True)["mergePullRequest"]["pullRequest"] # FIXME: figure out how to add more assignees at once def add_assignee(self, pr, assignee): _QUERY = """ - addAssigneesToAssignable(input: {{ - assignableId: "{id1}", - assigneeIds: "{id2}" - }}) {{ - clientMutationId - }} +addAssigneesToAssignable(input: {{ + assignableId: "{id1}", + assigneeIds: "{id2}" +}}) {{ + clientMutationId +}} """ query = _QUERY.format(id1=pr["id"], id2=assignee["id"]) @@ -373,28 +375,28 @@ class Query: """ _GET_LABEL = """ - repository(owner: "{owner}" name: "{name}") {{ - labels(first: {max_page_size} {next} query: "{label_name}") {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - name - color - }} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + labels(first: {max_page_size} {next} query: "{label_name}") {{ + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + id + name + color + }} + }} +}} """ _SET_LABEL = """ - addLabelsToLabelable(input: {{ - labelableId: "{pr_id}", - labelIds: "{label_id}" - }}) {{ - clientMutationId - }} +addLabelsToLabelable(input: {{ + labelableId: "{pr_id}", + labelIds: "{label_id}" +}}) {{ + clientMutationId +}} """ labels = [] @@ -415,10 +417,10 @@ class Query: name=self._name, label_name=label_name, 
max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) - labels += [label for label in result["nodes"]] + labels += list(result["nodes"]) if not labels: return @@ -458,27 +460,23 @@ class Query: # sleep a little, because we querying github too often time.sleep(0.1) - headers = {"Authorization": "bearer {}".format(self._token)} + headers = {"Authorization": f"bearer {self._token}"} if is_mutation: - query = """ - mutation {{ - {query} - }} - """.format( - query=query - ) + query = f""" +mutation {{ + {query} +}} + """ else: - query = """ - query {{ - {query} - rateLimit {{ - cost - remaining - }} - }} - """.format( - query=query - ) + query = f""" +query {{ + {query} + rateLimit {{ + cost + remaining + }} +}} + """ response = self.session.post( "https://api.github.com/graphql", json={"query": query}, headers=headers @@ -487,21 +485,15 @@ class Query: result = response.json() if "errors" in result: raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) + f"Errors occurred: {result['errors']}\nOriginal query: {query}" ) if not is_mutation: - if caller not in list(self.api_costs.keys()): + if caller not in self.api_costs: self.api_costs[caller] = 0 self.api_costs[caller] += result["data"]["rateLimit"]["cost"] return result["data"] else: - raise Exception( - "Query failed with code {code}:\n{json}".format( - code=response.status_code, - json=json.dumps(response.json(), indent=4), - ) - ) + data = json.dumps(response.json(), indent=4) + raise Exception(f"Query failed with code {response.status_code}:\n{data}") From d14a2d3583e606601cfbaadf3a26bdcfb2bac767 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:47:15 +0200 Subject: [PATCH 18/37] Fix docs about encryption functions --- .../functions/encryption-functions.md | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index fb821ca7783..06995cefaee 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -19,11 +19,10 @@ This function encrypts data using these modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Syntax** @@ -63,9 +62,9 @@ Insert some data (please avoid storing the keys/ivs in the database as this unde Query: ``` sql -INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ -('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ -('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ +INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', 
'12345678910121314151617181920212', 'iviviviviviviviv')),\ ('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` @@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test; Result: ``` text -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Example with `-gcm`: @@ -116,9 +115,7 @@ Supported encryption modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Syntax** @@ -145,7 +142,7 @@ Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext: Query: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; +SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` Result: @@ -161,14 +158,14 @@ But `encrypt` fails when `key` or `iv` is longer than expected: Query: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); +SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Result: ``` text -Received exception from server (version 21.1.2): -Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). +Received exception from server (version 22.6.1): +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). 
``` While `aes_encrypt_mysql` produces MySQL-compatitalbe output: @@ -176,7 +173,7 @@ While `aes_encrypt_mysql` produces MySQL-compatitalbe output: Query: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; ``` Result: @@ -192,7 +189,7 @@ Notice how supplying even longer `IV` produces the same result Query: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext ``` Result: @@ -224,11 +221,10 @@ This function decrypts ciphertext into a plaintext using these modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Syntax** @@ -265,12 +261,12 @@ Result: │ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ │ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ └──────────────────────┴──────────────────────────────────────────────┘ -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Now let's try to decrypt all that data. @@ -284,13 +280,19 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 Result: ``` text -┌─comment─────────────────────────────┬─plaintext─┐ -│ aes-256-cfb128 no IV │ Secret │ -│ aes-256-cfb128 no IV, different key │ �4� - � │ -│ aes-256-cfb128 with IV │ ���6�~ │ - │aes-256-cbc no IV │ �2*4�h3c�4w��@ -└─────────────────────────────────────┴───────────┘ +┌─comment──────────────┬─plaintext──┐ +│ aes-256-gcm │ OQ�E + �t�7T�\���\� │ +│ aes-256-gcm with AAD │ OQ�E + �\��si����;�o�� │ +└──────────────────────┴────────────┘ +┌─comment──────────────────────────┬─plaintext─┐ +│ aes-256-ofb no IV │ Secret │ +│ aes-256-ofb no IV, different key │ �4� + � │ +│ aes-256-ofb with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└──────────────────────────────────┴───────────┘ ``` Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption. 
@@ -305,9 +307,7 @@ Supported decryption modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Syntax** @@ -347,7 +347,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv Query: ``` sql -SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext +SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext ``` Result: From 023e7132fc8a836542b0acb6b17cf23ba2c2f0c9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:50:35 +0200 Subject: [PATCH 19/37] Fix --- docs/en/sql-reference/functions/encryption-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index 06995cefaee..58a1d9d56f8 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -203,7 +203,7 @@ Result: Which is binary equal to what MySQL produces on same inputs: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -332,7 +332,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) Let's decrypt data we've previously encrypted with MySQL: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; From 489765aa6d97c5eaed56310c1c172f2d46febf17 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 16 Jun 2022 14:51:49 +0200 Subject: [PATCH 20/37] Fix ru docs --- .../functions/encryption-functions.md | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 2eaad0e1930..9a8e09348f9 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -19,11 +19,10 @@ sidebar_label: "Функции для шифрования" - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Синтаксис** @@ -63,9 +62,9 @@ ENGINE = Memory; Запрос: ``` sql -INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ -('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ -('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 
'iviviviviviviviv')),\ +INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ ('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` @@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test; Результат: ``` text -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Пример в режиме `-gcm`: @@ -116,9 +115,7 @@ SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%'; - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Синтаксис** @@ -145,7 +142,7 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) Запрос: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; +SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` Результат: @@ -161,14 +158,14 @@ SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', ' Запрос: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); +SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Результат: ``` text Received exception from server (version 21.1.2): -Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). ``` Однако функция `aes_encrypt_mysql` в аналогичном случае возвращает результат, который может быть обработан MySQL: @@ -176,7 +173,7 @@ Code: 36. DB::Exception: Received from localhost:9000. 
DB::Exception: Invalid ke Запрос: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; ``` Результат: @@ -192,7 +189,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161 Запрос: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext ``` Результат: @@ -206,7 +203,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161 Это совпадает с результатом, возвращаемым MySQL при таких же входящих значениях: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -224,11 +221,10 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Синтаксис** @@ -265,12 +261,12 @@ SELECT comment, hex(secret) FROM encryption_test; │ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ │ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ └──────────────────────┴──────────────────────────────────────────────┘ -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Теперь попытаемся расшифровать эти данные: @@ -278,19 +274,25 @@ SELECT comment, hex(secret) FROM encryption_test; Запрос: ``` sql -SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; +SELECT comment, decrypt('aes-256-ofb', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; ``` Результат: ``` text -┌─comment─────────────────────────────┬─plaintext─┐ -│ aes-256-cfb128 no IV │ Secret │ -│ aes-256-cfb128 no IV, different key │ �4� - � │ -│ aes-256-cfb128 with IV │ ���6�~ │ - │aes-256-cbc no IV │ �2*4�h3c�4w��@ -└─────────────────────────────────────┴───────────┘ +┌─comment──────────────┬─plaintext──┐ +│ aes-256-gcm │ OQ�E + �t�7T�\���\� │ +│ aes-256-gcm with AAD │ OQ�E + �\��si����;�o�� │ +└──────────────────────┴────────────┘ +┌─comment──────────────────────────┬─plaintext─┐ +│ aes-256-ofb no IV │ Secret │ +│ aes-256-ofb no IV, different 
key │ �4� + � │ +│ aes-256-ofb with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└──────────────────────────────────┴───────────┘ ``` Обратите внимание, что только часть данных была расшифрована верно. Оставшаяся часть расшифрована некорректно, так как при шифровании использовались другие значения `mode`, `key`, или `iv`. @@ -305,9 +307,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Синтаксис** @@ -333,7 +333,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -348,7 +348,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv Запрос: ``` sql -SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; +SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; ``` Результат: From 5f3de6f48650bbfb5792fee4fdb00a902e24be71 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 16 Jun 2022 16:27:49 +0200 Subject: [PATCH 21/37] Use the best token instead of the constant --- tests/ci/cherry_pick.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index d9f94ffa6c7..745284b2b29 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -6,7 +6,7 @@ import os import subprocess from env_helper import GITHUB_WORKSPACE, TEMP_PATH -from get_robot_token import get_parameter_from_ssm +from get_robot_token import get_best_robot_token from ssh import SSHKey from cherry_pick_utils.backport import Backport from cherry_pick_utils.cherrypick import CherryPick @@ -21,7 +21,7 @@ def parse_args(): def main(): args = parse_args() - token = args.token or get_parameter_from_ssm("github_robot_token_1") + token = args.token or get_best_robot_token() bp = Backport( token, From 2544b0d53eb1f7d1c4acb8a6d37903adf149dccb Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 16 Jun 2022 16:29:48 +0200 Subject: [PATCH 22/37] Add retries with progressive sleep and dynamic page sice decreasing --- tests/ci/cherry_pick_utils/backport.py | 2 +- tests/ci/cherry_pick_utils/query.py | 69 +++++++++++++++++++------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/tests/ci/cherry_pick_utils/backport.py b/tests/ci/cherry_pick_utils/backport.py index 9a510caf92a..1bc910886de 100644 --- a/tests/ci/cherry_pick_utils/backport.py +++ b/tests/ci/cherry_pick_utils/backport.py @@ -16,7 +16,7 @@ from local import Repository as LocalRepo class Backport: def __init__(self, token, owner, name, team): self._gh = RemoteRepo( - token, owner=owner, name=name, team=team, max_page_size=30, min_page_size=7 + token, owner=owner, name=name, team=team, max_page_size=60, min_page_size=7 ) self._token = token self.default_branch_name = self._gh.default_branch diff --git a/tests/ci/cherry_pick_utils/query.py b/tests/ci/cherry_pick_utils/query.py index ef1ad5ad0fb..917f9901287 100644 --- a/tests/ci/cherry_pick_utils/query.py +++ b/tests/ci/cherry_pick_utils/query.py @@ -457,9 +457,6 @@ addLabelsToLabelable(input: {{ mutation = ", is mutation" print(f"---GraphQL request for {caller}({parameters}){mutation}---") - # sleep a little, because we querying github too often - time.sleep(0.1) - headers = {"Authorization": f"bearer {self._token}"} if is_mutation: query = f""" @@ -478,22 +475,58 @@ query {{ }} """ - response = self.session.post( - "https://api.github.com/graphql", json={"query": query}, headers=headers - ) - if response.status_code == 200: - result = response.json() - if "errors" in result: - raise Exception( - f"Errors occurred: {result['errors']}\nOriginal query: {query}" + def request_with_retry(retry=0): + max_retries = 5 + # From time to time we face some concrete errors, when it worth to + # retry instead of failing competely + # We should sleep progressively + progressive_sleep = 5 * sum(i + 1 for i in range(retry)) + if progressive_sleep: + logging.warning( + "Retry GraphQL request %s time, sleep %s seconds", + retry, + progressive_sleep, ) + time.sleep(progressive_sleep) + response = self.session.post( + "https://api.github.com/graphql", json={"query": query}, headers=headers + ) + result = response.json() + if response.status_code == 200: + if "errors" in result: + raise Exception( + f"Errors occurred: {result['errors']}\nOriginal query: {query}" + ) - if not is_mutation: - if caller not in self.api_costs: - self.api_costs[caller] = 0 - self.api_costs[caller] += result["data"]["rateLimit"]["cost"] + if not is_mutation: + if caller not in self.api_costs: + self.api_costs[caller] = 0 + self.api_costs[caller] += result["data"]["rateLimit"]["cost"] - return result["data"] - else: - data = json.dumps(response.json(), indent=4) + return result["data"] + elif ( + response.status_code == 403 + and "secondary rate limit" in result["message"] + ): + if retry <= max_retries: + logging.warning("Secondary rate limit reached") + return request_with_retry(retry + 1) + elif response.status_code == 502 and "errors" in result: + too_many_data = any( + True + for err in result["errors"] + if "message" in err + and "This may be the result of a timeout" in err["message"] + ) + if too_many_data: + logging.warning( + "Too many data is requested, decreasing page size %s by 10%%", + self._max_page_size, + ) + self._max_page_size = int(self._max_page_size * 0.9) + return request_with_retry(retry) + + data = json.dumps(result, indent=4) raise Exception(f"Query failed with 
code {response.status_code}:\n{data}") + + return request_with_retry() From e5ceaf3c0f61f59859f8c76c651218bb03e3e999 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 16 Jun 2022 17:49:26 +0200 Subject: [PATCH 23/37] Launch cherry-pick/backport job on graviton --- .github/workflows/backport.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/backport.yml b/.github/workflows/backport.yml index 66dddbee640..da42bbae78a 100644 --- a/.github/workflows/backport.yml +++ b/.github/workflows/backport.yml @@ -13,7 +13,7 @@ on: # yamllint disable-line rule:truthy jobs: CherryPick: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Set envs # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings From 2a9942ee809c1eaafefa5a3fef24d35e45cb560a Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 16 Jun 2022 15:50:03 +0000 Subject: [PATCH 24/37] Add setting multiple_joins_try_to_keep_original_names --- src/Core/Settings.h | 1 + src/Interpreters/InterpreterSelectQuery.cpp | 2 ++ src/Interpreters/JoinToSubqueryTransformVisitor.cpp | 11 +++++++++-- src/Interpreters/JoinToSubqueryTransformVisitor.h | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e3f756c85f5..4115061c3df 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -600,6 +600,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ + M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. 
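The new setting's description is terse, so a condensed illustration may help: when a query contains more than one JOIN, ClickHouse rewrites it into nested subqueries, and the rewrite normally replaces top-level column names with generated unique aliases, so an outer query can no longer reference the original names. The sketch below is hypothetical and mirrors the test added later in this patch; it is not part of the diff itself.

``` sql
-- Hypothetical usage sketch, not part of the patch. As the new test suggests,
-- without the setting the multiple-joins rewrite aliases `first.table1_id` to a
-- generated name and the outer SELECT fails with UNKNOWN_IDENTIFIER; with the
-- setting enabled the original name is kept.
SELECT table1_id FROM (
    SELECT first.table1_id
    FROM (SELECT number + 1 AS table1_id FROM numbers(1)) AS first
    JOIN (SELECT number + 1 AS table2_id FROM numbers(1)) AS second ON first.table1_id = second.table2_id
    JOIN (SELECT number + 1 AS table3_id FROM numbers(1)) AS third ON first.table1_id = third.table3_id
)
SETTINGS multiple_joins_try_to_keep_original_names = 1;
```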
diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 94ac7c26183..ec7c3878b06 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -232,6 +232,8 @@ static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & table CrossToInnerJoinVisitor(cross_to_inner).visit(query); JoinToSubqueryTransformVisitor::Data join_to_subs_data{tables, aliases}; + join_to_subs_data.try_to_keep_original_names = settings.multiple_joins_try_to_keep_original_names; + JoinToSubqueryTransformVisitor(join_to_subs_data).visit(query); } diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index a3ffaafa4db..e07430c0feb 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -361,6 +361,7 @@ struct CheckAliasDependencyVisitorData dependency = &ident; } }; + using CheckAliasDependencyMatcher = OneTypeMatcher; using CheckAliasDependencyVisitor = InDepthNodeVisitor; @@ -500,6 +501,7 @@ void restoreName(ASTIdentifier & ident, const String & original_name, NameSet & { if (!ident.tryGetAlias().empty()) return; + if (original_name.empty()) return; @@ -509,7 +511,9 @@ void restoreName(ASTIdentifier & ident, const String & original_name, NameSet & restored_names.emplace(original_name); } else + { ident.setShortName(original_name); + } } /// Find clashes and normalize names @@ -527,12 +531,12 @@ std::vector normalizeColumnNamesExtractNeeded( { size_t last_table_pos = tables.size() - 1; - NameSet restored_names; std::vector needed_columns; needed_columns.reserve(tables.size()); for (const auto & table : tables) needed_columns.push_back(TableNeededColumns{table.table}); + NameSet restored_names; for (ASTIdentifier * ident : identifiers) { bool got_alias = aliases.contains(ident->name()); @@ -729,7 +733,10 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast std::unordered_set public_identifiers; for (auto & top_level_child : select.select()->children) if (auto * ident = top_level_child->as()) - public_identifiers.insert(ident); + { + if (!data.try_to_keep_original_names || startsWith(ident->name(), UniqueShortNames::pattern)) + public_identifiers.insert(ident); + } UniqueShortNames unique_names; std::vector needed_columns = diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.h b/src/Interpreters/JoinToSubqueryTransformVisitor.h index a024a168509..96420512ae6 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.h +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.h @@ -21,6 +21,7 @@ public: const std::vector & tables; const Aliases & aliases; bool done = false; + bool try_to_keep_original_names = false; }; static bool needChildVisit(ASTPtr &, const ASTPtr &); From 15988a220b5463b0f5c485e653bc546a336aa4ce Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 16 Jun 2022 15:52:55 +0000 Subject: [PATCH 25/37] Add test multiple_joins_original_names --- ...37_multiple_joins_original_names.reference | 2 ++ .../02337_multiple_joins_original_names.sql | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/queries/0_stateless/02337_multiple_joins_original_names.reference create mode 100644 tests/queries/0_stateless/02337_multiple_joins_original_names.sql diff --git a/tests/queries/0_stateless/02337_multiple_joins_original_names.reference b/tests/queries/0_stateless/02337_multiple_joins_original_names.reference 
new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02337_multiple_joins_original_names.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02337_multiple_joins_original_names.sql b/tests/queries/0_stateless/02337_multiple_joins_original_names.sql new file mode 100644 index 00000000000..afafee9f8eb --- /dev/null +++ b/tests/queries/0_stateless/02337_multiple_joins_original_names.sql @@ -0,0 +1,22 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/34697 + +SELECT table1_id FROM ( + SELECT first.table1_id + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +); -- { serverError UNKNOWN_IDENTIFIER } + +SELECT table1_id FROM ( + SELECT first.table1_id + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +) SETTINGS multiple_joins_try_to_keep_original_names = 1; + +SELECT aaa FROM ( + SELECT first.table1_id as aaa + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +) SETTINGS multiple_joins_try_to_keep_original_names = 1; From 6fdcac1c9d109b3f9bddffc9b5e652d5fcd3bde8 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Thu, 16 Jun 2022 19:47:11 +0000 Subject: [PATCH 26/37] Remove processor description from span attributes - it is not working anyway. --- src/Processors/Executors/ExecutionThreadContext.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Executors/ExecutionThreadContext.cpp b/src/Processors/Executors/ExecutionThreadContext.cpp index d77fe6138cd..5a5c1826c61 100644 --- a/src/Processors/Executors/ExecutionThreadContext.cpp +++ b/src/Processors/Executors/ExecutionThreadContext.cpp @@ -103,7 +103,6 @@ bool ExecutionThreadContext::executeTask() #endif span.addAttribute("thread_number", thread_number); - span.addAttribute("processor.description", node->processor->getDescription()); return node->exception == nullptr; } From 00aca924d0586b41b751ed6536c47653494009a8 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 16 Jun 2022 22:45:28 +0000 Subject: [PATCH 27/37] Update GROUP BY clause docs --- .../statements/select/group-by.md | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index 45230d0b3b1..e02db6d4f6b 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -48,9 +48,9 @@ You can see that `GROUP BY` for `y = NULL` summed up `x`, as if `NULL` is this v If you pass several keys to `GROUP BY`, the result will give you all the combinations of the selection, as if `NULL` were a specific value. -## WITH ROLLUP Modifier +## ROLLUP Modifier -`WITH ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotals rows are added after the result table. 
+`ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotals rows are added after the result table. The subtotals are calculated in the reverse order: at first subtotals are calculated for the last key expression in the list, then for the previous one, and so on up to the first key expression. @@ -78,7 +78,7 @@ Consider the table t: Query: ```sql -SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; +SELECT year, month, day, count(*) FROM t GROUP BY ROLLUP(year, month, day); ``` As `GROUP BY` section has three key expressions, the result contains four tables with subtotals "rolled up" from right to left: @@ -109,10 +109,14 @@ As `GROUP BY` section has three key expressions, the result contains four tables │ 0 │ 0 │ 0 │ 6 │ └──────┴───────┴─────┴─────────┘ ``` +The same query can also be written using the `WITH` keyword. +```sql +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; +``` -## WITH CUBE Modifier +## CUBE Modifier -`WITH CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotals rows are added after the result table. +`CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotals rows are added after the result table. In the subtotals rows the values of all "grouped" key expressions are set to `0` or empty line. @@ -138,7 +142,7 @@ Consider the table t: Query: ```sql -SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE; +SELECT year, month, day, count(*) FROM t GROUP BY CUBE(year, month, day); ``` As `GROUP BY` section has three key expressions, the result contains eight tables with subtotals for all key expression combinations: @@ -196,6 +200,10 @@ Columns, excluded from `GROUP BY`, are filled with zeros. │ 0 │ 0 │ 0 │ 6 │ └──────┴───────┴─────┴─────────┘ ``` +The same query can also be written using the `WITH` keyword. +```sql +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE; +``` ## WITH TOTALS Modifier @@ -260,6 +268,39 @@ GROUP BY domain For every different key value encountered, `GROUP BY` calculates a set of aggregate function values. +## GROUPING SETS modifier + +This is the most general modifier. +This modifier allows you to manually specify several aggregation key sets (grouping sets). +Aggregation is performed separately for each grouping set, after which all results are combined. +If a column is not present in a grouping set, it is filled with a default value. + +In other words, the modifiers described above can all be represented via `GROUPING SETS`. +Although queries with the `ROLLUP`, `CUBE` and `GROUPING SETS` modifiers are semantically equivalent, they may have different performance. +While `GROUPING SETS` tries to execute everything in parallel, `ROLLUP` and `CUBE` execute the final merging of the aggregates in a single thread. + +When source columns contain default values, it might be hard to distinguish whether a row is part of the aggregation that uses those columns as keys or not. +To solve this problem, the `GROUPING` function must be used. + +**Example** + +The following two queries are equivalent. 
+ +```sql +-- Query 1 +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; + +-- Query 2 +SELECT year, month, day, count(*) FROM t GROUP BY +GROUPING SETS +( + (year, month, day), + (year, month), + (year), + () +); +``` + ## Implementation Details Aggregation is one of the most important features of a column-oriented DBMS, and thus it’s implementation is one of the most heavily optimized parts of ClickHouse. By default, aggregation is done in memory using a hash-table. It has 40+ specializations that are chosen automatically depending on “grouping key” data types. From 26b3dd2d9e2de87ea721338b9511c5f786e4be6a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 17 Jun 2022 13:41:58 +0200 Subject: [PATCH 28/37] Fix description --- docs/en/sql-reference/functions/string-search-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 8d35204d783..0ecf980c163 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -354,7 +354,7 @@ Checks whether the string matches the regular expression `pattern` in `re2` synt Returns 0 if it does not match, or 1 if it matches. -Matching is based on UTF-8, e.g. `.` matches the two-codepoint symbol `¥`. The regular expression must not contain null bytes. +Matching is based on UTF-8, e.g. `.` matches the Unicode code point `¥` which is stored in UTF-8 using two bytes. The regular expression must not contain null bytes. For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. @@ -499,7 +499,7 @@ The regular expression can contain the metasymbols `%` and `_`. Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the ‘match’ function. -Matching is based on UTF-8, e.g. `_` matches the two-codepoint symbol `¥`. +Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which is stored in UTF-8 using two bytes. For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. For other regular expressions, the code is the same as for the ‘match’ function. From ba0761fb6a9334ce18e20b972757765a79baab47 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 17 Jun 2022 15:26:59 +0200 Subject: [PATCH 29/37] More details on matching with Unicode --- .../functions/string-functions.md | 8 +++--- .../functions/string-search-functions.md | 26 ++++++++++++------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 0c1c738f663..0ab105c79d6 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -273,16 +273,16 @@ Converts ASCII Latin symbols in a string to uppercase. ## lowerUTF8 {#lowerutf8} Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It does not detect the language. So for Turkish the result might not be exactly correct. +It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I). If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. 
-If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. +If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined. ## upperUTF8 {#upperutf8} Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It does not detect the language. So for Turkish the result might not be exactly correct. +It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I). If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. -If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. +If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined. ## isValidUTF8 {#isvalidutf8} diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 0ecf980c163..fd9022b6549 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -7,7 +7,7 @@ sidebar_label: For Searching in Strings The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search. -:::note +:::note Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. ::: @@ -31,7 +31,7 @@ position(needle IN haystack) Alias: `locate(haystack, needle[, start_pos])`. -:::note +:::note Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. ::: @@ -344,7 +344,7 @@ Returns 1, if at least one string needlei matches the string `haystac For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. -:::note +:::note In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. ::: @@ -354,7 +354,9 @@ Checks whether the string matches the regular expression `pattern` in `re2` synt Returns 0 if it does not match, or 1 if it matches. -Matching is based on UTF-8, e.g. `.` matches the Unicode code point `¥` which is stored in UTF-8 using two bytes. The regular expression must not contain null bytes. +Matching is based on UTF-8, e.g. `.` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes. The regular expression must not contain null bytes. +If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined. +No automatic Unicode normalization is performed, if you need it you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that. For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. @@ -362,7 +364,7 @@ For patterns to search for substrings in a string, it is better to use LIKE or The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. 
-:::note +:::note The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. ::: @@ -386,11 +388,11 @@ The same as `multiFuzzyMatchAny`, but returns any index that matches the haystac The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance. -:::note +:::note `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. ::: -:::note +:::note To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. ::: @@ -406,7 +408,7 @@ Extracts all the fragments of a string using a regular expression. If ‘haystac Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -:::note +:::note `extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). ::: @@ -499,7 +501,9 @@ The regular expression can contain the metasymbols `%` and `_`. Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the ‘match’ function. -Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which is stored in UTF-8 using two bytes. +Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes. +If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined. +No automatic Unicode normalization is performed, if you need it you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that. For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. For other regular expressions, the code is the same as for the ‘match’ function. @@ -512,6 +516,8 @@ The same thing as ‘like’, but negative. Case insensitive variant of [like](https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions/#function-like) function. You can use `ILIKE` operator instead of the `ilike` function. +The function ignores the language, e.g. for Turkish (i/İ), the result might be incorrect. + **Syntax** ``` sql @@ -580,7 +586,7 @@ Same as `ngramDistance` but calculates the non-symmetric difference between `nee For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. -:::note +:::note For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. 
::: From d64935259c1aec0956ab46bfa1f467d7781e68bc Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 17 Jun 2022 14:57:51 -0400 Subject: [PATCH 30/37] add ClickHouse Keeper to replication doc --- .../mergetree-family/replication.md | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index cbe586d75a3..acbd4a0eece 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -27,7 +27,7 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa - The `DROP TABLE` query deletes the replica located on the server where the query is run. - The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. -ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replicas meta information. Use ZooKeeper version 3.4.5 or newer. +ClickHouse uses [ClickHouse Keeper](../../../guides/sre/keeper/clickhouse-keeper.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section. @@ -35,7 +35,7 @@ To use replication, set parameters in the [zookeeper](../../../operations/server Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. ::: -Example of setting the addresses of the ZooKeeper cluster: +Example of setting the addresses of the ClickHouse Keeper cluster: ``` xml @@ -55,7 +55,7 @@ Example of setting the addresses of the ZooKeeper cluster: ``` ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments. -In other word, it supports to store the metadata of differnt tables in different ZooKeeper clusters. +In other word, it supports to store the metadata of different tables in different ZooKeeper clusters. Example of setting the addresses of the auxiliary ZooKeeper cluster: @@ -122,8 +122,8 @@ The `Replicated` prefix is added to the table engine name. For example:`Replicat **Replicated\*MergeTree parameters** -- `zoo_path` — The path to the table in ZooKeeper. -- `replica_name` — The replica name in ZooKeeper. +- `zoo_path` — The path to the table in ClickHouse Keeper. +- `replica_name` — The replica name in ClickHouse Keeper. - `other_parameters` — Parameters of an engine which is used for creating the replicated version, for example, version in `ReplacingMergeTree`. Example: @@ -168,18 +168,18 @@ Example: ``` -The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths. +The path to the table in ClickHouse Keeper should be unique for each replicated table. Tables on different shards should have different paths. In this case, the path consists of the following parts: `/clickhouse/tables/` is the common prefix. We recommend using exactly this one. `{layer}-{shard}` is the shard identifier. 
In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier. -`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query. +`table_name` is the name of the node for the table in ClickHouse Keeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query. *HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name` The two built-in substitutions `{database}` and `{table}` can be used, they expand into the table name and the database name respectively (unless these macros are defined in the `macros` section). So the zookeeper path can be specified as `'/clickhouse/tables/{layer}-{shard}/{database}/{table}'`. -Be careful with table renames when using these built-in substitutions. The path in Zookeeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in Zookeeper, and will go into read-only mode. +Be careful with table renames when using these built-in substitutions. The path in ClickHouse Keeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in ClickHouse Keeper, and will go into read-only mode. The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard. @@ -220,21 +220,21 @@ To delete a replica, run `DROP TABLE`. However, only one replica is deleted – ## Recovery After Failures {#recovery-after-failures} -If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper. +If ClickHouse Keeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ClickHouse Keeper. -If ZooKeeper is unavailable during an `INSERT`, or an error occurs when interacting with ZooKeeper, an exception is thrown. +If ClickHouse Keeper is unavailable during an `INSERT`, or an error occurs when interacting with ClickHouse Keeper, an exception is thrown. -After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. +After connecting to ClickHouse Keeper, the system checks whether the set of data in the local file system matches the expected set of data (ClickHouse Keeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. -If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas. 
+If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ClickHouse Keeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas. Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data. -When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. +When the server starts (or establishes a new session with ClickHouse Keeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by “pushing a button”. -To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables: +To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ClickHouse Keeper with any content, or run the command to restore all replicated tables: ``` bash sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data @@ -249,11 +249,11 @@ If all data and metadata disappeared from one of the servers, follow these steps 1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. 2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`). 3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.) -4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` +4. 
To start recovery, create the ClickHouse Keeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` Then start the server (restart, if it is already running). Data will be downloaded from replicas. -An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. +An alternative recovery option is to delete information about the lost replica from ClickHouse Keeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. @@ -276,13 +276,13 @@ Create a MergeTree table with a different name. Move all the data from the direc If you want to get rid of a `ReplicatedMergeTree` table without launching the server: - Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`). -- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`). +- Delete the corresponding path in ClickHouse Keeper (`/path_to_table/replica_name`). After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server. -## Recovery When Metadata in the Zookeeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} +## Recovery When Metadata in the ClickHouse Keeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} -If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. +If the data in ClickHouse Keeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. **See Also** From b6bd08e37714e77d9a7f538781e7743d8965ffed Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Fri, 17 Jun 2022 15:44:44 -0400 Subject: [PATCH 31/37] Update docs/en/engines/table-engines/mergetree-family/replication.md Co-authored-by: Antonio Andelic --- docs/en/engines/table-engines/mergetree-family/replication.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index acbd4a0eece..b182a82b01a 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -55,7 +55,7 @@ Example of setting the addresses of the ClickHouse Keeper cluster: ``` ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments. -In other word, it supports to store the metadata of different tables in different ZooKeeper clusters. +In other word, it supports storing the metadata of different tables in different ZooKeeper clusters. 
Example of setting the addresses of the auxiliary ZooKeeper cluster: From 0cd294c0c4ae51f5e3049d02964918ae70eca5b8 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 17 Jun 2022 16:09:28 -0400 Subject: [PATCH 32/37] reword --- docs/en/engines/table-engines/mergetree-family/replication.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index b182a82b01a..3562bdf6d3a 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -54,8 +54,8 @@ Example of setting the addresses of the ClickHouse Keeper cluster: ``` -ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments. -In other word, it supports storing the metadata of different tables in different ZooKeeper clusters. +ClickHouse also supports storing replicas meta information in an auxiliary ZooKeeper cluster. Do this by providing the ZooKeeper cluster name and path as engine arguments. +In other words, it supports storing the metadata of different tables in different ZooKeeper clusters. Example of setting the addresses of the auxiliary ZooKeeper cluster: From 54e599161d4d4a70e36dcaa1065311198aa58130 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Sat, 18 Jun 2022 00:16:45 +0200 Subject: [PATCH 33/37] Check row size to avoid out of bounds access --- src/Processors/Transforms/PostgreSQLSource.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Processors/Transforms/PostgreSQLSource.cpp b/src/Processors/Transforms/PostgreSQLSource.cpp index 6926ac26bbc..77c2fc41aa1 100644 --- a/src/Processors/Transforms/PostgreSQLSource.cpp +++ b/src/Processors/Transforms/PostgreSQLSource.cpp @@ -1,4 +1,5 @@ #include "PostgreSQLSource.h" +#include "Common/Exception.h" #if USE_LIBPQXX #include @@ -22,6 +23,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TOO_MANY_COLUMNS; +} template PostgreSQLSource::PostgreSQLSource( @@ -123,6 +128,11 @@ Chunk PostgreSQLSource::generate() if (!row) break; + if (row->size() > description.sample_block.columns()) + throw Exception(ErrorCodes::TOO_MANY_COLUMNS, + "Row has too many columns: {}, expected structure: {}", + row->size(), description.sample_block.dumpStructure()); + for (const auto idx : collections::range(0, row->size())) { const auto & sample = description.sample_block.getByPosition(idx); From c107a58c11d2eaa28ae2da3e2f82988e970279c6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jun 2022 04:18:08 +0300 Subject: [PATCH 34/37] Update 02293_test_zstd_window_log_max.sh --- tests/queries/0_stateless/02293_test_zstd_window_log_max.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh index 39d1b443739..0683d194738 100755 --- a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh @@ -8,4 +8,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reuse the test data in 01946_test_zstd_decompression_with_escape_sequence_at_the_end_of_buffer.sh $CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS 
zstd_window_log_max=20" 2>&1 | grep -c \ "Code: 561. DB::Exception: Zstd stream encoding failed: error 'Frame requires too much memory for decoding'; zstd version: 1.5.0: While executing File. (ZSTD_DECODER_FAILED)" -$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=21" \ No newline at end of file +$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=21" From ac5886939b97a1989a39e5ab116d86ae8c5b4f0f Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sat, 18 Jun 2022 00:27:57 -0300 Subject: [PATCH 35/37] Update index.md --- .../sql-reference/window-functions/index.md | 369 ++++++++++++++++++ 1 file changed, 369 insertions(+) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index e53ea41d606..5dca61bb8b0 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -55,3 +55,372 @@ https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html + +## Syntax + +```text +aggregate_function (column_name) + OVER ([PARTITION BY groupping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bounds_of_frame]) +``` + +- `PARTITION BY` - defines how to break a resultset into groups. +- `ORDER BY` - defines how to order rows inside the group during calculation aggregate_function. +- `ROWS or RANGE` - defines bounds of a frame, aggregate_function is calculated within a frame. + +```text + PARTITION +┌─────────────────┐ <-- UNBOUNDED PRECEDING (BEGINING of the PARTITION) +│ │ +│ │ +│=================│ <-- N PRECEDING <─┐ +│ N ROWS │ │ F +│ Before CURRENT │ │ R +│~~~~~~~~~~~~~~~~~│ <-- CURRENT ROW │ A +│ M ROWS │ │ M +│ After CURRENT │ │ E +│=================│ <-- M FOLLOWING <─┘ +│ │ +│ │ +└─────────────────┘ <--- UNBOUNDED FOLLOWING (END of the PARTITION) +``` + +## Examples + +```sql +CREATE TABLE wf_partition +( + `part_key` UInt64, + `value` UInt64 +) +ENGINE = Memory; + +INSERT INTO wf_partition FORMAT Values + (1,1,1), (1,2,2), (1,3,3), (2,0,0), (3,0,0); + +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key) AS frame_values +FROM wf_partition +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3] │ <┐ +│ 1 │ 2 │ 2 │ [1,2,3] │ │ 1-st group +│ 1 │ 3 │ 3 │ [1,2,3] │ <┘ +│ 2 │ 0 │ 0 │ [0] │ <- 2-nd group +│ 3 │ 0 │ 0 │ [0] │ <- 3-d group +└──────────┴───────┴───────┴──────────────┘ +``` + +```sql +CREATE TABLE wf_frame +( + `part_key` UInt64, + `value` UInt64, + `order` UInt64 +) +ENGINE = Memory; + +INSERT INTO wf_frame FORMAT Values + (1,1,1), (1,2,2), (1,3,3), (1,4,4), (1,5,5); + +-- frame is bounded by bounds of a partition (BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [1,2,3,4,5] │ +│ 1 │ 4 │ 4 │ [1,2,3,4,5] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- short form 
- no bound expression, no order by +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [1,2,3,4,5] │ +│ 1 │ 4 │ 4 │ [1,2,3,4,5] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- frame is bounded by the beggining of a partition and the current row +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [1,2,3] │ +│ 1 │ 4 │ 4 │ [1,2,3,4] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- short form (frame is bounded by the beggining of a partition and the current row) +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [1,2,3] │ +│ 1 │ 4 │ 4 │ [1,2,3,4] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- frame is bounded by the beggining of a partition and the current row, but order is backward +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order DESC) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [5,4,3,2,1] │ +│ 1 │ 2 │ 2 │ [5,4,3,2] │ +│ 1 │ 3 │ 3 │ [5,4,3] │ +│ 1 │ 4 │ 4 │ [5,4] │ +│ 1 │ 5 │ 5 │ [5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- sliding frame - 1 PRECEDING ROW AND CURRENT ROW +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN 1 PRECEDING AND CURRENT ROW) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [2,3] │ +│ 1 │ 4 │ 4 │ [3,4] │ +│ 1 │ 5 │ 5 │ [4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- sliding frame - Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [2,3,4,5] │ +│ 1 │ 4 │ 4 │ [3,4,5] │ +│ 1 │ 5 │ 5 │ [4,5] │ +└──────────┴───────┴───────┴──────────────┘ +``` + +## Real world examples + +### Maximum/total salary per department. 
+ +```sql +CREATE TABLE employees +( + `department` String, + `employee_name` String, + `salary` Float +) +ENGINE = Memory; + +INSERT INTO employees FORMAT Values + ('Finance', 'Jonh', 200), + ('Finance', 'Joan', 210), + ('Finance', 'Jean', 505), + ('IT', 'Tim', 200), + ('IT', 'Anna', 300), + ('IT', 'Elen', 500); + +SELECT + department, + employee_name AS emp, + salary, + max_salary_per_dep, + total_salary_per_dep, + round((salary / total_salary_per_dep) * 100, 2) AS `share_per_dep(%)` +FROM +( + SELECT + department, + employee_name, + salary, + max(salary) OVER wndw AS max_salary_per_dep, + sum(salary) OVER wndw AS total_salary_per_dep + FROM employees + WINDOW wndw AS (PARTITION BY department + rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + ORDER BY + department ASC, + employee_name ASC +); + +┌─department─┬─emp──┬─salary─┬─max_salary_per_dep─┬─total_salary_per_dep─┬─share_per_dep(%)─┐ +│ Finance │ Jean │ 505 │ 505 │ 915 │ 55.19 │ +│ Finance │ Joan │ 210 │ 505 │ 915 │ 22.95 │ +│ Finance │ Jonh │ 200 │ 505 │ 915 │ 21.86 │ +│ IT │ Anna │ 300 │ 500 │ 1000 │ 30 │ +│ IT │ Elen │ 500 │ 500 │ 1000 │ 50 │ +│ IT │ Tim │ 200 │ 500 │ 1000 │ 20 │ +└────────────┴──────┴────────┴────────────────────┴──────────────────────┴──────────────────┘ +``` + +### Cumulative sum. + +```sql +CREATE TABLE events +( + `metric` String, + `ts` DateTime, + `value` Float +) +ENGINE = Memory + +INSERT INTO warehouse VALUES + ('sku38', '2020-01-01', 9), + ('sku38', '2020-02-01', 1), + ('sku38', '2020-03-01', -4), + ('sku1', '2020-01-01', 1), + ('sku1', '2020-02-01', 1), + ('sku1', '2020-03-01', 1); + +SELECT + item, + ts, + value, + sum(value) OVER (PARTITION BY item ORDER BY ts ASC) AS stock_balance +FROM warehouse +ORDER BY + item ASC, + ts ASC; + +┌─item──┬──────────────────ts─┬─value─┬─stock_balance─┐ +│ sku1 │ 2020-01-01 00:00:00 │ 1 │ 1 │ +│ sku1 │ 2020-02-01 00:00:00 │ 1 │ 2 │ +│ sku1 │ 2020-03-01 00:00:00 │ 1 │ 3 │ +│ sku38 │ 2020-01-01 00:00:00 │ 9 │ 9 │ +│ sku38 │ 2020-02-01 00:00:00 │ 1 │ 10 │ +│ sku38 │ 2020-03-01 00:00:00 │ -4 │ 6 │ +└───────┴─────────────────────┴───────┴───────────────┘ +``` + +### Moving / Sliding Average (per 3 rows) + +```sql +CREATE TABLE sensors +( + `metric` String, + `ts` DateTime, + `value` Float +) +ENGINE = Memory; + +insert into sensors values('cpu_temp', '2020-01-01 00:00:00', 87), + ('cpu_temp', '2020-01-01 00:00:01', 77), + ('cpu_temp', '2020-01-01 00:00:02', 93), + ('cpu_temp', '2020-01-01 00:00:03', 87), + ('cpu_temp', '2020-01-01 00:00:04', 87), + ('cpu_temp', '2020-01-01 00:00:05', 87), + ('cpu_temp', '2020-01-01 00:00:06', 87), + ('cpu_temp', '2020-01-01 00:00:07', 87); +SELECT + metric, + ts, + value, + avg(value) OVER + (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN 2 PRECEDING AND CURRENT ROW) + AS moving_avg_temp +FROM sensors +ORDER BY + metric ASC, + ts ASC; + +┌─metric───┬──────────────────ts─┬─value─┬───moving_avg_temp─┐ +│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:01 │ 77 │ 82 │ +│ cpu_temp │ 2020-01-01 00:00:02 │ 93 │ 85.66666666666667 │ +│ cpu_temp │ 2020-01-01 00:00:03 │ 87 │ 85.66666666666667 │ +│ cpu_temp │ 2020-01-01 00:00:04 │ 87 │ 89 │ +│ cpu_temp │ 2020-01-01 00:00:05 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:06 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:07 │ 87 │ 87 │ +└──────────┴─────────────────────┴───────┴───────────────────┘ +``` + +### Moving / Sliding Average (per 10 seconds) + +```sql +SELECT + metric, + ts, + value, + avg(value) OVER (PARTITION BY metric ORDER BY ts + Range BETWEEN 10 PRECEDING 
AND CURRENT ROW) AS moving_avg_10_seconds_temp +FROM sensors +ORDER BY + metric ASC, + ts ASC; + +┌─metric───┬──────────────────ts─┬─value─┬─moving_avg_10_seconds_temp─┐ +│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:01:10 │ 77 │ 77 │ +│ cpu_temp │ 2020-01-01 00:02:20 │ 93 │ 93 │ +│ cpu_temp │ 2020-01-01 00:03:30 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:04:40 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:05:50 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:06:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:07:10 │ 87 │ 87 │ +└──────────┴─────────────────────┴───────┴────────────────────────────┘ +``` From 16e8b85fbfe86a32a960fcd94331a33aa3b2fc3f Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 18 Jun 2022 14:08:14 +0200 Subject: [PATCH 36/37] Revert "Add a setting to use more memory for zstd decompression" --- src/Core/Settings.h | 2 -- src/IO/CompressionMethod.cpp | 9 +++++---- src/IO/CompressionMethod.h | 3 +-- src/IO/ZstdInflatingReadBuffer.cpp | 8 +------- src/IO/ZstdInflatingReadBuffer.h | 3 +-- src/Server/HTTPHandler.cpp | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 6 ++---- src/Storages/StorageFile.cpp | 3 +-- src/Storages/StorageS3.cpp | 7 ++----- src/Storages/StorageURL.cpp | 6 ++---- .../02293_test_zstd_window_log_max.reference | 2 -- .../0_stateless/02293_test_zstd_window_log_max.sh | 11 ----------- 12 files changed, 16 insertions(+), 46 deletions(-) delete mode 100644 tests/queries/0_stateless/02293_test_zstd_window_log_max.reference delete mode 100755 tests/queries/0_stateless/02293_test_zstd_window_log_max.sh diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a16e338f7ab..e3f756c85f5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -183,8 +183,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \ \ - M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD (it will not be used for MergeTree family)", 0) \ - \ M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \ M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. 
Can be used to adjust query priority for OS scheduler.", 0) \ \ diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index 0da235c074c..fe4772948ad 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -99,7 +99,7 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s } static std::unique_ptr createCompressedWrapper( - std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) + std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) { if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib) return std::make_unique(std::move(nested), method, buf_size, existing_memory, alignment); @@ -110,7 +110,7 @@ static std::unique_ptr createCompressedWrapper( if (method == CompressionMethod::Xz) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); if (method == CompressionMethod::Zstd) - return std::make_unique(std::move(nested), buf_size, existing_memory, alignment, zstd_window_log_max); + return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); if (method == CompressionMethod::Lz4) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); #if USE_BZIP2 @@ -126,13 +126,14 @@ static std::unique_ptr createCompressedWrapper( } std::unique_ptr wrapReadBufferWithCompressionMethod( - std::unique_ptr nested, CompressionMethod method, int zstd_window_log_max, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) { if (method == CompressionMethod::None) return nested; - return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment, zstd_window_log_max); + return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment); } + std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment) { diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index a399a756c13..3953ba9d212 100644 --- a/src/IO/CompressionMethod.h +++ b/src/IO/CompressionMethod.h @@ -5,6 +5,7 @@ #include + namespace DB { class ReadBuffer; @@ -49,12 +50,10 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s std::unique_ptr wrapReadBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, - int zstd_window_log_max = 0, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); - std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 0d026cdab9a..712ea6960ef 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -8,7 +8,7 @@ namespace ErrorCodes extern const int ZSTD_DECODER_FAILED; } -ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) +ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment) : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment) { dctx = ZSTD_createDCtx(); @@ -19,12 +19,6 
@@ ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_ { throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: zstd version: {}", ZSTD_VERSION_STRING); } - - size_t ret = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, zstd_window_log_max); - if (ZSTD_isError(ret)) - { - throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: {}", ZSTD_getErrorName(ret)); - } } ZstdInflatingReadBuffer::~ZstdInflatingReadBuffer() diff --git a/src/IO/ZstdInflatingReadBuffer.h b/src/IO/ZstdInflatingReadBuffer.h index faa6231d4e2..a0c20b79d80 100644 --- a/src/IO/ZstdInflatingReadBuffer.h +++ b/src/IO/ZstdInflatingReadBuffer.h @@ -20,8 +20,7 @@ public: std::unique_ptr in_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0, - int zstd_window_log_max = 0); + size_t alignment = 0); ~ZstdInflatingReadBuffer() override; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index cdf856e87d5..39870fc91dc 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -649,7 +649,7 @@ void HTTPHandler::processQuery( /// Request body can be compressed using algorithm specified in the Content-Encoding header. String http_request_compression_method_str = request.get("Content-Encoding", ""); auto in_post = wrapReadBufferWithCompressionMethod( - wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str), context->getSettingsRef().zstd_window_log_max); + wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str)); /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter. 
diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 708bfd5ef8b..2edcbeb9a7e 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -197,9 +197,8 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (it == paths.end()) return nullptr; auto compression = chooseCompressionMethod(*it, compression_method); - auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( - std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); + std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression); }; return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths.size() > 1, ctx); } @@ -328,8 +327,7 @@ bool HDFSSource::initialize() const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - const auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); + read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression); auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 2fa6003c0eb..d466096c8ba 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -208,8 +208,7 @@ std::unique_ptr createReadBuffer( in.setProgressCallback(context); } - auto zstd_window_log_max = context->getSettingsRef().zstd_window_log_max; - return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, zstd_window_log_max); + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); } } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b4b97570ad1..f524a405c9b 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -434,8 +434,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint), zstd_window_log_max); + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -1171,12 +1170,10 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( read_keys_in_distributed_processing->push_back(key); first = false; - const auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( std::make_unique( s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), - chooseCompressionMethod(key, compression_method), - zstd_window_log_max); + chooseCompressionMethod(key, compression_method)); }; return 
readSchemaFromFormat(format, format_settings, read_buffer_iterator, is_key_with_globs, ctx); diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index a90b6974c74..cd55c32fb9c 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -350,8 +350,7 @@ namespace std::move(read_buffer_factory), threadPoolCallbackRunner(IOThreadPool::get()), download_threads), - chooseCompressionMethod(request_uri.getPath(), compression_method), - settings.zstd_window_log_max); + chooseCompressionMethod(request_uri.getPath(), compression_method)); } } catch (const Poco::Exception & e) @@ -382,8 +381,7 @@ namespace delay_initialization, /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error), - chooseCompressionMethod(request_uri.getPath(), compression_method), - settings.zstd_window_log_max); + chooseCompressionMethod(request_uri.getPath(), compression_method)); } catch (...) { diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference b/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference deleted file mode 100644 index 98ca7fb2d29..00000000000 --- a/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference +++ /dev/null @@ -1,2 +0,0 @@ -1 -40 diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh deleted file mode 100755 index 0683d194738..00000000000 --- a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-parallel - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -# reuse the test data in 01946_test_zstd_decompression_with_escape_sequence_at_the_end_of_buffer.sh -$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=20" 2>&1 | grep -c \ - "Code: 561. DB::Exception: Zstd stream encoding failed: error 'Frame requires too much memory for decoding'; zstd version: 1.5.0: While executing File. (ZSTD_DECODER_FAILED)" -$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max=21" From aebc090701b0bf7226e1caabb39eab37a6a7061a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 18 Jun 2022 15:56:37 +0300 Subject: [PATCH 37/37] Update index.md --- docs/en/sql-reference/window-functions/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 5dca61bb8b0..d7f4a696476 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -70,7 +70,7 @@ aggregate_function (column_name) ```text PARTITION -┌─────────────────┐ <-- UNBOUNDED PRECEDING (BEGINING of the PARTITION) +┌─────────────────┐ <-- UNBOUNDED PRECEDING (BEGINNING of the PARTITION) │ │ │ │ │=================│ <-- N PRECEDING <─┐