From aa092eeffb8f09d9d65294bfa2a62cacc258e562 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 27 Dec 2021 16:42:06 +0300 Subject: [PATCH 01/58] proper handle of 'max_rows_to_read' in case of reading in order of sorting key and limit --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 14 ++++++++++-- .../MergeTree/MergeTreeSelectProcessor.cpp | 5 ----- ...5_read_in_order_max_rows_to_read.reference | 6 +++++ .../02155_read_in_order_max_rows_to_read.sql | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index cdedd37e14a..07ac6f5764b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -875,12 +875,22 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd { std::atomic total_rows{0}; + /// Do not check number of read rows if we have reading + /// in order of sorting key with limit. + /// In general case, when there exists WHERE clause + /// it's impossible to estimate number of rows precisely, + /// because we can stop reading at any time. + SizeLimits limits; - if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read) + if (settings.read_overflow_mode == OverflowMode::THROW + && settings.max_rows_to_read + && !query_info.input_order_info) limits = SizeLimits(settings.max_rows_to_read, 0, settings.read_overflow_mode); SizeLimits leaf_limits; - if (settings.read_overflow_mode_leaf == OverflowMode::THROW && settings.max_rows_to_read_leaf) + if (settings.read_overflow_mode_leaf == OverflowMode::THROW + && settings.max_rows_to_read_leaf + && !query_info.input_order_info) leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, 0, settings.read_overflow_mode_leaf); auto mark_cache = context->getIndexMarkCache(); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 2d4d3617cee..332eb27094a 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,11 +37,6 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { - /// Actually it means that parallel reading from replicas enabled - /// and we have to collaborate with initiator. 
- /// In this case we won't set approximate rows, because it will be accounted multiple times - if (!extension_.has_value()) - addTotalRowsApprox(total_rows); ordered_names = header_without_virtual_columns.getNames(); } diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference new file mode 100644 index 00000000000..b73ab43cabb --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference @@ -0,0 +1,6 @@ +10 +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql new file mode 100644 index 00000000000..e82c78b5e42 --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_max_rows_to_read; + +CREATE TABLE t_max_rows_to_read (a UInt64) +ENGINE = MergeTree ORDER BY a +SETTINGS index_granularity = 4; + +INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); + +SET max_threads = 1; + +SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 12; + +-- This should work, but actually it doesn't. Need to investigate. +-- SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 20; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 20 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a = 10 OR a = 20 FORMAT Null SETTINGS max_rows_to_read = 4; -- { serverError 158 } + +DROP TABLE t_max_rows_to_read; From 04e6e6ab5e4818f8774ecf3878ca2205d8e1aae6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 14:52:26 +0000 Subject: [PATCH 02/58] Add ParallelReadBuffer for S3 --- src/Common/RangeGenerator.h | 46 ++++++++++++++++++++ src/IO/ReadBufferFromS3.h | 73 ++++++++++++++++++++++++++++++++ src/IO/ReadWriteBufferFromHTTP.h | 38 +---------------- src/IO/S3Common.cpp | 23 ++++++++++ src/IO/S3Common.h | 2 + src/Storages/StorageS3.cpp | 8 +++- 6 files changed, 152 insertions(+), 38 deletions(-) create mode 100644 src/Common/RangeGenerator.h diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h new file mode 100644 index 00000000000..8d3b88936fc --- /dev/null +++ b/src/Common/RangeGenerator.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class RangeGenerator +{ +public: + explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) + : from(range_start), range_step(range_step_), total_size(total_size_) + { + } + + size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } + + using Range = std::pair; + + // return upper exclusive range of values, i.e. 
[from_range, to_range> + std::optional nextRange() + { + if (from >= total_size) + { + return std::nullopt; + } + + auto to = from + range_step; + if (to >= total_size) + { + to = total_size; + } + + Range range{from, to}; + from = to; + return std::move(range); + } + +private: + size_t from; + size_t range_step; + size_t total_size; +}; + +} diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 157b6d46b6d..6668f570cb6 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,5 +1,6 @@ #pragma once +#include #include #if USE_AWS_S3 @@ -10,6 +11,7 @@ #include #include #include +#include #include @@ -62,6 +64,12 @@ public: size_t getFileOffsetOfBufferEnd() const override { return offset; } + void setRange(off_t begin, off_t end) + { + offset = begin; + read_until_position = end; + } + private: std::unique_ptr initialize(); @@ -76,6 +84,71 @@ private: bool restricted_seek; }; +/// Creates separate ReadBufferFromS3 for sequence of ranges of particular object +class ReadBufferS3Factory : public ParallelReadBuffer::ReadBufferFactory +{ +public: + explicit ReadBufferS3Factory( + std::shared_ptr client_ptr_, + const String & bucket_, + const String & key_, + size_t range_step_, + size_t object_size_, + UInt64 s3_max_single_read_retries_, + const ReadSettings &read_settings) + : client_ptr(client_ptr_) + , bucket(bucket_) + , key(key_) + , settings(read_settings) + , range_generator(object_size_, range_step_) + , range_step(range_step_) + , object_size(object_size_) + , s3_max_single_read_retries(s3_max_single_read_retries_) + { + assert(range_step > 0); + assert(range_step < object_size); + } + + size_t totalRanges() const + { + return static_cast(round(static_cast(object_size) / range_step)); + } + + SeekableReadBufferPtr getReader() override + { + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, bucket, key, s3_max_single_read_retries, settings); + reader->setRange(next_range->first, next_range->second - 1); + return reader; + } + + off_t seek(off_t off, [[maybe_unused]] int whence) override + { + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; + } + + std::optional getTotalSize() override { return object_size; } + +private: + std::shared_ptr client_ptr; + const String bucket; + const String key; + ReadSettings settings; + + RangeGenerator range_generator; + size_t range_step; + size_t object_size; + + UInt64 s3_max_single_read_retries; +}; + } #endif diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 061dd772212..f0c10759142 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -635,43 +636,6 @@ public: void buildNewSession(const Poco::URI & uri) override { session = makeHTTPSession(uri, timeouts); } }; -class RangeGenerator -{ -public: - explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) - : from(range_start), range_step(range_step_), total_size(total_size_) - { - } - - size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } - - using Range = std::pair; - - // return upper exclusive range of values, i.e. 
[from_range, to_range> - std::optional nextRange() - { - if (from >= total_size) - { - return std::nullopt; - } - - auto to = from + range_step; - if (to >= total_size) - { - to = total_size; - } - - Range range{from, to}; - from = to; - return std::move(range); - } - -private: - size_t from; - size_t range_step; - size_t total_size; -}; - class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase> { using Parent = detail::ReadWriteBufferFromHTTPBase>; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index e63e6fde1f4..971ac0d8a73 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -33,6 +33,8 @@ # include # include +#include // Y_IGNORE +#include // Y_IGNORE namespace { @@ -682,6 +684,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int S3_ERROR; } namespace S3 @@ -839,6 +842,26 @@ namespace S3 throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}", quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : ""); } + + size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error) + { + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket); + req.SetKey(key); + + Aws::S3::Model::HeadObjectOutcome outcome = client_ptr->HeadObject(req); + + if (outcome.IsSuccess()) + { + auto read_result = outcome.GetResultWithOwnership(); + return static_cast(read_result.GetContentLength()); + } + else if (throw_on_error) + { + throw DB::Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + } + return 0; + } } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 97cb4f74f90..c33ce427e66 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -75,6 +75,8 @@ struct URI static void validateBucket(const String & bucket, const Poco::URI & uri); }; +size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error = true); + } #endif diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f319bd1097b..6f5228c2c63 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1,4 +1,6 @@ #include +#include "IO/ParallelReadBuffer.h" +#include "IO/IOThreadPool.h" #include "Parsers/ASTCreateQuery.h" #if USE_AWS_S3 @@ -273,9 +275,13 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + std::make_unique(std::move(factory), &IOThreadPool::get(), 4), chooseCompressionMethod(current_key, compression_hint)); + auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); From 6785ad165a72e372f155a10e71a872e37bdfc2c1 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 14:06:30 +0000 Subject: [PATCH 03/58] Fix issue for mutliple download threads --- src/IO/ParallelReadBuffer.cpp | 2 +- src/IO/ParallelReadBuffer.h | 4 ++-- src/IO/ReadBufferFromS3.h | 4 ++-- src/Storages/StorageS3.cpp | 20 +++++++++++++++----- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git 
a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 7fa10c160ad..3ee62148c23 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -237,7 +237,7 @@ void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker) while (!emergency_stop && !read_worker->cancel) { if (!read_worker->reader->next()) - throw Exception("Failed to read all the data from the reader", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to read all the data from the reader, missing {} bytes", read_worker->bytes_left); if (emergency_stop || read_worker->cancel) break; diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 7b364205e8e..7a3f0cb3773 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -82,8 +82,8 @@ public: std::unique_ptr reader_factory_, ThreadPool * pool, size_t max_working_readers, - WorkerSetup worker_setup = {}, - WorkerCleanup worker_cleanup = {}); + WorkerSetup worker_setup = [](ThreadStatus &){}, + WorkerCleanup worker_cleanup = [](ThreadStatus &){}); ~ParallelReadBuffer() override { finishAndWait(); } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 6668f570cb6..e3708c4b748 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -60,7 +60,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position }; } + Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } @@ -124,7 +124,7 @@ public: auto reader = std::make_shared( client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second - 1); + reader->setRange(next_range->first, next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6f5228c2c63..9dc15e54ecc 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -275,12 +275,22 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + const auto max_download_threads = getContext()->getSettings().max_download_threads; + if (max_download_threads == 1) + { + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + chooseCompressionMethod(current_key, compression_hint)); + } + else + { + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), 4), - chooseCompressionMethod(current_key, compression_hint)); + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), + chooseCompressionMethod(current_key, compression_hint)); + } auto input_format = getContext()->getInputFormat(format, *read_buf, 
sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; From f693eba568415e66d856da7c45df824d97949361 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 22 Mar 2022 14:30:40 +0000 Subject: [PATCH 04/58] fix tests with approx rows --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 9 ++++++--- src/Storages/MergeTree/MergeTreeIOSettings.h | 2 ++ src/Storages/MergeTree/MergeTreeSelectProcessor.cpp | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1bfc1ec7306..cf7b2dcc855 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -45,7 +45,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & context) +static MergeTreeReaderSettings getMergeTreeReaderSettings( + const ContextPtr & context, const SelectQueryInfo & query_info) { const auto & settings = context->getSettingsRef(); return @@ -53,6 +54,7 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & con .read_settings = context->getReadSettings(), .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, + .read_in_order = query_info.input_order_info != nullptr, }; } @@ -82,7 +84,7 @@ ReadFromMergeTree::ReadFromMergeTree( getPrewhereInfo(query_info_), data_.getPartitionValueType(), virt_column_names_)}) - , reader_settings(getMergeTreeReaderSettings(context_)) + , reader_settings(getMergeTreeReaderSettings(context_, query_info_)) , prepared_parts(std::move(parts_)) , real_column_names(std::move(real_column_names_)) , virt_column_names(std::move(virt_column_names_)) @@ -203,6 +205,7 @@ ProcessorPtr ReadFromMergeTree::createSource( .colums_to_read = required_columns }; } + return std::make_shared( data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info, @@ -918,7 +921,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( total_marks_pk += part->index_granularity.getMarksCountWithoutFinal(); parts_before_pk = parts.size(); - auto reader_settings = getMergeTreeReaderSettings(context); + auto reader_settings = getMergeTreeReaderSettings(context, query_info); bool use_skip_indexes = settings.use_skip_indexes; if (select.final() && !settings.use_skip_indexes_if_final) diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index aaa8fae7dba..4fe424c98ff 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -20,6 +20,8 @@ struct MergeTreeReaderSettings bool save_marks_in_cache = false; /// Validate checksums on reading (should be always enabled in production). bool checksum_on_read = true; + /// True if we read in order of sorting key. 
+ bool read_in_order = false; }; struct MergeTreeWriterSettings diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 90e272e329e..f6cbf54b752 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,6 +37,14 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { + /// Actually it means that parallel reading from replicas enabled + /// and we have to collaborate with initiator. + /// In this case we won't set approximate rows, because it will be accounted multiple times. + /// Also do not count amount of read rows if we read in order of sorting key, + /// because we don't know actual amount of read rows in case when limit is set. + if (!extension_.has_value() && !reader_settings.read_in_order) + addTotalRowsApprox(total_rows); + ordered_names = header_without_virtual_columns.getNames(); } From f32ef2a556f4b6a667ec2a0ff3d3553703046dc0 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:15:18 +0000 Subject: [PATCH 05/58] Small polishing for S3 reader --- src/IO/ReadBufferFromS3.cpp | 4 +++- src/IO/ReadBufferFromS3.h | 48 +++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 93bbe02c9cd..05e1276bc63 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -42,6 +42,7 @@ ReadBufferFromS3::ReadBufferFromS3( UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, + size_t offset_, size_t read_until_position_, bool restricted_seek_) : SeekableReadBufferWithSize(nullptr, 0) @@ -49,9 +50,10 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , max_single_read_retries(max_single_read_retries_) + , offset(offset_) + , read_until_position(read_until_position_) , read_settings(settings_) , use_external_buffer(use_external_buffer_) - , read_until_position(read_until_position_) , restricted_seek(restricted_seek_) { } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index e3708c4b748..8d71a03fb87 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -#include +# include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include -#include +# include namespace Aws::S3 { @@ -32,7 +32,9 @@ private: String bucket; String key; UInt64 max_single_read_retries; + off_t offset = 0; + off_t read_until_position = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; @@ -47,6 +49,7 @@ public: UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer = false, + size_t offset_ = 0, size_t read_until_position_ = 0, bool restricted_seek_ = false); @@ -60,16 +63,10 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } - void setRange(off_t begin, off_t end) - { - offset = begin; - read_until_position = end; - } - private: std::unique_ptr initialize(); @@ 
-77,8 +74,6 @@ private: bool use_external_buffer; - off_t read_until_position = 0; - /// There is different seek policy for disk seek and for non-disk seek /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet). bool restricted_seek; @@ -95,11 +90,11 @@ public: size_t range_step_, size_t object_size_, UInt64 s3_max_single_read_retries_, - const ReadSettings &read_settings) + const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) , key(key_) - , settings(read_settings) + , read_settings(read_settings_) , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) @@ -109,11 +104,6 @@ public: assert(range_step < object_size); } - size_t totalRanges() const - { - return static_cast(round(static_cast(object_size) / range_step)); - } - SeekableReadBufferPtr getReader() override { const auto next_range = range_generator.nextRange(); @@ -123,8 +113,14 @@ public: } auto reader = std::make_shared( - client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second); + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second + 1); return reader; } @@ -140,7 +136,7 @@ private: std::shared_ptr client_ptr; const String bucket; const String key; - ReadSettings settings; + ReadSettings read_settings; RangeGenerator range_generator; size_t range_step; From 131b3a091c60a13ed7cd7ddfe090e5dda204b6ad Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:40:00 +0000 Subject: [PATCH 06/58] Refactor StorageS3 --- src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 70 ++++++++++++++++++++++++++------------ src/Storages/StorageS3.h | 6 +++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 8d71a03fb87..d984fbebe5f 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -120,7 +120,7 @@ public: read_settings, false /*use_external_buffer*/, next_range->first, - next_range->second + 1); + next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9dc15e54ecc..6966d356070 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -238,7 +238,8 @@ StorageS3Source::StorageS3Source( String compression_hint_, const std::shared_ptr & client_, const String & bucket_, - std::shared_ptr file_iterator_) + std::shared_ptr file_iterator_, + const size_t download_thread_num_) : SourceWithProgress(getHeader(sample_block_, need_path, need_file)) , WithContext(context_) , name(std::move(name_)) @@ -254,6 +255,7 @@ StorageS3Source::StorageS3Source( , with_file_column(need_file) , with_path_column(need_path) , file_iterator(file_iterator_) + , download_thread_num(download_thread_num_) { initialize(); } @@ -275,22 +277,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - const auto max_download_threads = getContext()->getSettings().max_download_threads; - if (max_download_threads == 1) - { - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), - chooseCompressionMethod(current_key, compression_hint)); - } - else - { - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, 
object_size, max_single_read_retries, getContext()->getReadSettings()); - - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), - chooseCompressionMethod(current_key, compression_hint)); - } + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -298,10 +285,9 @@ bool StorageS3Source::initialize() if (columns_desc.hasDefaults()) { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_desc, *input_format, getContext()); - }); + builder.addSimpleTransform( + [&](const Block & header) + { return std::make_shared(header, columns_desc, *input_format, getContext()); }); } pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -311,6 +297,43 @@ bool StorageS3Source::initialize() return true; } +std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key) +{ + const size_t object_size = DB::S3::getObjectSize(client, bucket, key, false); + + auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + const bool use_parallel_download = download_buffer_size > 0 && download_thread_num > 1; + const bool object_too_small = object_size < download_thread_num * download_buffer_size; + if (!use_parallel_download || object_too_small) + { + LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); + } + + assert(object_size > 0); + + if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) + { + LOG_WARNING( + &Poco::Logger::get("StorageS3Source"), + "Downloading buffer {} bytes too small, set at least {} bytes", + download_buffer_size, + DBMS_DEFAULT_BUFFER_SIZE); + download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; + } + + auto factory = std::make_unique( + client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); + LOG_TRACE( + &Poco::Logger::get("StorageS3Source"), + "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", + download_thread_num, + object_size, + download_buffer_size); + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); +} + String StorageS3Source::getName() const { return name; @@ -674,6 +697,8 @@ Pipe StorageS3::read( block_for_format = storage_snapshot->metadata->getSampleBlock(); } + const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; + const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -690,7 +715,8 @@ Pipe StorageS3::read( compression_method, client_auth.client, client_auth.uri.bucket, - iterator_wrapper)); + iterator_wrapper, + download_threads_per_stream)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 300b7becb93..6c88f5bba56 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -74,7 +74,8 @@ public: String compression_hint_, const std::shared_ptr & client_, const String & bucket, - std::shared_ptr file_iterator_); + std::shared_ptr file_iterator_, + size_t download_thread_num); String getName() const override; @@ -105,9 +106,12 @@ private: bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; + size_t download_thread_num = 1; /// Recreate ReadBuffer and BlockInputStream for each file. bool initialize(); + + std::unique_ptr createS3ReadBuffer(const String & key); }; /** From 93ad209910cb00c492bd727de0e5eed3baf2e55d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 11:52:31 +0000 Subject: [PATCH 07/58] Attach memory tracker --- src/Storages/StorageS3.cpp | 32 ++++++++++++++++++++++++++++---- src/Storages/StorageS3.h | 1 - 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6966d356070..35a8405eeb2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -293,7 +293,6 @@ bool StorageS3Source::initialize() pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); reader = std::make_unique(*pipeline); - initialized = false; return true; } @@ -331,7 +330,32 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k object_size, download_buffer_size); - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); + ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() + ? CurrentThread::get().getThreadGroup() + : MainThreadStatus::getInstance().getThreadGroup(); + + ContextPtr query_context + = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + + auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) + { + if (has_running_group) + thread_status.detachQuery(false); + }; + + auto worker_setup = [query_context = std::move(query_context), + running_group = std::move(running_group)](ThreadStatus & thread_status) + { + /// Save query context if any, because cache implementation needs it. + if (query_context) + thread_status.attachQueryContext(query_context); + + /// To be able to pass ProfileEvents. + if (running_group) + thread_status.attachQuery(running_group); + }; + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); } String StorageS3Source::getName() const @@ -698,7 +722,7 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); + // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -716,7 +740,7 @@ Pipe StorageS3::read( client_auth.client, client_auth.uri.bucket, iterator_wrapper, - download_threads_per_stream)); + max_download_threads)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6c88f5bba56..6362e777df6 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -102,7 +102,6 @@ private: std::unique_ptr reader; /// onCancel and generate can be called concurrently std::mutex reader_mutex; - bool initialized = false; bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; From 1a5f5c32bb2e2a96d22c0ef8132d29fc21db5aba Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 14:36:25 +0000 Subject: [PATCH 08/58] Remove commented out code --- src/Storages/StorageS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 35a8405eeb2..b80b9ead53f 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -722,7 +722,6 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( From e0d7b6dc3ed239ef4075add09090751d45941a4e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:30:06 +0000 Subject: [PATCH 09/58] Add tests for S3 multithreaded download --- tests/integration/test_storage_s3/test.py | 34 ++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index dd29d0a5d6a..7364af4d2dd 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -535,8 +535,9 @@ def test_multipart_put(started_cluster, maybe_auth, positive): one_line_length = 6 # 3 digits, 2 commas, 1 line separator. 
+ total_rows = csv_size_bytes // one_line_length # Generate data having size more than one part - int_data = [[1, 2, 3] for i in range(csv_size_bytes // one_line_length)] + int_data = [[1, 2, 3] for i in range(total_rows)] csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data]) assert len(csv_data) > min_part_size_bytes @@ -573,6 +574,37 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) + # select uploaded data from many threads + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) + ) + try: + select_result = run_query( + instance, + select_query, + settings={ + "max_download_threads": random.randint(4, 16), + "max_download_buffer_size": 1024 * 1024, + }, + ) + except helpers.client.QueryRuntimeException: + if positive: + raise + else: + assert positive + assert ( + select_result + == "\t".join(map(str, [total_rows, total_rows * 2, total_rows * 3])) + "\n" + ) + def test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] From 62a495a5fc64c24b83982a362c8e72b965f2fb09 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:38:11 +0000 Subject: [PATCH 10/58] Small refactor --- src/IO/ReadBufferFromS3.cpp | 30 ++++++++++++++++ src/IO/ReadBufferFromS3.h | 42 ++++++----------------- src/IO/S3Common.cpp | 3 +- src/Storages/StorageS3.cpp | 3 +- tests/integration/test_storage_s3/test.py | 2 +- 5 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 05e1276bc63..c78f41a0f9d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -274,6 +274,36 @@ std::unique_ptr ReadBufferFromS3::initialize() throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); } +SeekableReadBufferPtr ReadBufferS3Factory::getReader() +{ + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second); + return reader; +} + +off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence) +{ + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; +} + +std::optional ReadBufferS3Factory::getTotalSize() +{ + return object_size; +} } #endif diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index d984fbebe5f..dbba751b321 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -# include +#include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include -# include +#include namespace Aws::S3 { @@ -104,33 +104,11 @@ public: assert(range_step < object_size); } - SeekableReadBufferPtr getReader() override - { - const auto next_range = range_generator.nextRange(); - if (!next_range) - { - return nullptr; - } + SeekableReadBufferPtr getReader() override; - auto reader = std::make_shared( - client_ptr, - bucket, - key, - s3_max_single_read_retries, - read_settings, - false /*use_external_buffer*/, - 
next_range->first, - next_range->second); - return reader; - } + off_t seek(off_t off, [[maybe_unused]] int whence) override; - off_t seek(off_t off, [[maybe_unused]] int whence) override - { - range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; - return off; - } - - std::optional getTotalSize() override { return object_size; } + std::optional getTotalSize() override; private: std::shared_ptr client_ptr; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 971ac0d8a73..e706f0a75c1 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -24,6 +24,7 @@ # include # include # include +# include // Y_IGNORE # include # include @@ -33,8 +34,6 @@ # include # include -#include // Y_IGNORE -#include // Y_IGNORE namespace { diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b80b9ead53f..5625ff16a10 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -355,7 +355,8 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k thread_status.attachQuery(running_group); }; - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); + return std::make_unique( + std::move(factory), &IOThreadPool::get(), download_thread_num, std::move(worker_setup), std::move(worker_cleanup)); } String StorageS3Source::getName() const diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 7364af4d2dd..7fc59d9d9ef 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -517,7 +517,7 @@ def test_put_get_with_globs(started_cluster): # ("'minio','minio123',",True), Redirect with credentials not working with nginx. ], ) -def test_multipart_put(started_cluster, maybe_auth, positive): +def test_multipart(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = ( From 67195bfdd5f883e2e23e591bc78d2f7a56200c04 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 25 Mar 2022 21:00:00 +0000 Subject: [PATCH 11/58] support schema inference for type Object in format JSONEachRow --- src/Formats/JSONEachRowUtils.cpp | 20 +- src/Formats/ReadSchemaUtils.cpp | 40 ++++ src/Functions/FunctionsConversion.h | 173 +++++++++++++----- src/Interpreters/InterpreterInsertQuery.cpp | 1 + src/Interpreters/executeQuery.cpp | 16 +- src/Processors/Formats/ISchemaReader.h | 5 + .../01825_type_json_from_map.reference | 4 + .../0_stateless/01825_type_json_from_map.sql | 41 +++++ ...01825_type_json_schema_inference.reference | 8 + .../01825_type_json_schema_inference.sh | 52 ++++++ 10 files changed, 303 insertions(+), 57 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.reference create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.sql create mode 100644 tests/queries/0_stateless/01825_type_json_schema_inference.reference create mode 100755 tests/queries/0_stateless/01825_type_json_schema_inference.sh diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 66e0538fef1..80b9f84dafe 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -158,22 +159,29 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; + bool is_map = true; for (const auto key_value_pair : object) { auto type = 
getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type) - return nullptr; + if (!type || isObject(type)) + { + is_map = false; + break; + } if (value_type && value_type->getName() != type->getName()) - return nullptr; + { + is_map = false; + break; + } value_type = type; } - if (!value_type) - return nullptr; + if (is_map && value_type) + return std::make_shared(std::make_shared(), value_type); - return std::make_shared(std::make_shared(), value_type); + return std::make_shared("json", false); } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 559fac4cfaa..8e2531e2006 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace DB { @@ -17,6 +19,28 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static std::optional getOrderedColumnsList( + const NamesAndTypesList & columns_list, const Names & columns_order_hint) +{ + if (columns_list.size() != columns_order_hint.size()) + return {}; + + std::unordered_map available_columns; + for (const auto & [name, type] : columns_list) + available_columns.emplace(name, type); + + NamesAndTypesList res; + for (const auto & name : columns_order_hint) + { + auto it = available_columns.find(name); + if (it == available_columns.end()) + return {}; + + res.emplace_back(name, it->second); + } + return res; +} + ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, @@ -52,6 +76,22 @@ ColumnsDescription readSchemaFromFormat( { throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); } + + /// If we have "INSERT SELECT" query then try to order + /// columns as they are ordered in table schema for formats + /// without strict column order (like JSON and TSKV). + /// It will allow to execute simple data loading with query + /// "INSERT INTO table SELECT * FROM ..." + const auto & insertion_table = context->getInsertionTable(); + if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + { + auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); + auto metadata = storage->getInMemoryMetadataPtr(); + auto names_in_storage = metadata->getColumns().getNamesOfPhysical(); + auto ordered_list = getOrderedColumnsList(names_and_types, names_in_storage); + if (ordered_list) + names_and_types = *ordered_list; + } } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index e098378f51a..edddb200648 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -53,6 +53,7 @@ #include #include #include +#include namespace DB @@ -3140,52 +3141,138 @@ private: } } + WrapperType createTupleToObjectWrapper(const DataTypeTuple & from_tuple, bool has_nullable_subcolumns) const + { + if (!from_tuple.haveExplicitNames()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_tuple.getName()); + + PathsInData paths; + DataTypes from_types; + + std::tie(paths, from_types) = flattenTuple(from_tuple.getPtr()); + auto to_types = from_types; + + for (auto & type : to_types) + { + if (isTuple(type) || isNested(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. Got: {}", + from_tuple.getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns, from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + + WrapperType createMapToObjectWrapper(const DataTypeMap & from_map, bool has_nullable_subcolumns) const + { + auto key_value_types = from_map.getKeyValueTypes(); + + if (!isStringOrFixedString(key_value_types[0])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object from Map can be performed only from Map " + "with String or FixedString key. 
Got: {}", from_map.getName()); + + const auto & value_type = key_value_types[1]; + auto to_value_type = value_type; + + if (!has_nullable_subcolumns && value_type->isNullable()) + to_value_type = removeNullable(value_type); + + if (has_nullable_subcolumns && !value_type->isNullable()) + to_value_type = makeNullable(value_type); + + DataTypes to_key_value_types{std::make_shared(), std::move(to_value_type)}; + auto element_wrappers = getElementWrappers(key_value_types, to_key_value_types); + + return [has_nullable_subcolumns, element_wrappers, key_value_types, to_key_value_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t) -> ColumnPtr + { + const auto & column_map = assert_cast(*arguments.front().column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + auto key_value_columns = column_map.getNestedData().getColumnsCopy(); + + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element{{key_value_columns[i], key_value_types[i], ""}}; + key_value_columns[i] = element_wrappers[i](element, to_key_value_types[i], nullable_source, key_value_columns[i]->size()); + } + + const auto & key_column_str = assert_cast(*key_value_columns[0]); + const auto & value_column = *key_value_columns[1]; + + using SubcolumnsMap = HashMap; + SubcolumnsMap subcolumns; + + for (size_t row = 0; row < offsets.size(); ++row) + { + for (size_t i = offsets[static_cast(row) - 1]; i < offsets[row]; ++i) + { + auto ref = key_column_str.getDataAt(i); + + bool inserted; + SubcolumnsMap::LookupResult it; + subcolumns.emplace(ref, it, inserted); + auto & subcolumn = it->getMapped(); + + if (inserted) + subcolumn = value_column.cloneEmpty()->cloneResized(row); + + /// Map can have duplicated keys. We insert only first one. + if (subcolumn->size() == row) + subcolumn->insertFrom(value_column, i); + } + + /// Insert default values for keys missed in current row. + for (const auto & [_, subcolumn] : subcolumns) + if (subcolumn->size() == row) + subcolumn->insertDefault(); + } + + auto column_object = ColumnObject::create(has_nullable_subcolumns); + for (auto && [key, subcolumn] : subcolumns) + { + PathInData path(key.toView()); + column_object->addSubcolumn(path, std::move(subcolumn)); + } + + return column_object; + }; + } + WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const { if (const auto * from_tuple = checkAndGetDataType(from_type.get())) { - if (!from_tuple->haveExplicitNames()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. Got: {}", from_type->getName()); - - PathsInData paths; - DataTypes from_types; - - std::tie(paths, from_types) = flattenTuple(from_type); - auto to_types = from_types; - - for (auto & type : to_types) - { - if (isTuple(type) || isNested(type)) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_type->getName()); - - type = recursiveRemoveLowCardinality(type); - } - - return [element_wrappers = getElementWrappers(from_types, to_types), - has_nullable_subcolumns = to_type->hasNullableSubcolumns(), from_types, to_types, paths] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) - { - size_t tuple_size = to_types.size(); - auto flattened_column = flattenTuple(arguments.front().column); - const auto & column_tuple = assert_cast(*flattened_column); - - if (tuple_size != column_tuple.getColumns().size()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Expected tuple with {} subcolumn, but got {} subcolumns", - tuple_size, column_tuple.getColumns().size()); - - auto res = ColumnObject::create(has_nullable_subcolumns); - for (size_t i = 0; i < tuple_size; ++i) - { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; - auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); - res->addSubcolumn(paths[i], converted_column->assumeMutable()); - } - - return res; - }; + return createTupleToObjectWrapper(*from_tuple, to_type->hasNullableSubcolumns()); + } + else if (const auto * from_map = checkAndGetDataType(from_type.get())) + { + return createMapToObjectWrapper(*from_map, to_type->hasNullableSubcolumns()); } else if (checkAndGetDataType(from_type.get())) { @@ -3199,7 +3286,7 @@ private: } throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten named tuple or string. Got: {}", from_type->getName()); + "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } template diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index df44814a96e..ce369182766 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -358,6 +358,7 @@ BlockIO InterpreterInsertQuery::execute() auto new_context = Context::createCopy(context); new_context->setSettings(new_settings); + new_context->setInsertionTable(getContext()->getInsertionTable()); InterpreterSelectWithUnionQuery interpreter_select{ query.select, new_context, SelectQueryOptions(QueryProcessingStage::Complete, 1)}; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index c1606700540..89f5f4e6959 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -656,6 +656,14 @@ static std::tuple executeQueryImpl( limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode); } + if (const auto * insert_interpreter = typeid_cast(&*interpreter)) + { + /// Save insertion table (not table function). TODO: support remote() table function. + auto table_id = insert_interpreter->getDatabaseTable(); + if (!table_id.empty()) + context->setInsertionTable(std::move(table_id)); + } + { std::unique_ptr span; if (context->query_trace_context.trace_id != UUID()) @@ -666,14 +674,6 @@ static std::tuple executeQueryImpl( } res = interpreter->execute(); } - - if (const auto * insert_interpreter = typeid_cast(&*interpreter)) - { - /// Save insertion table (not table function). TODO: support remote() table function. 
- auto table_id = insert_interpreter->getDatabaseTable(); - if (!table_id.empty()) - context->setInsertionTable(std::move(table_id)); - } } if (process_list_entry) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 2d35809e26a..36cf0656119 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -18,6 +18,10 @@ public: virtual NamesAndTypesList readSchema() = 0; + /// True if order of columns is important in format. + /// Exceptions: JSON, TSKV. + virtual bool hasStrictOrderOfColumns() const { return true; } + virtual ~ISchemaReader() = default; protected: @@ -60,6 +64,7 @@ class IRowWithNamesSchemaReader : public ISchemaReader public: IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); NamesAndTypesList readSchema() override; + bool hasStrictOrderOfColumns() const override { return false; } protected: /// Read one row and determine types of columns in it. diff --git a/tests/queries/0_stateless/01825_type_json_from_map.reference b/tests/queries/0_stateless/01825_type_json_from_map.reference new file mode 100644 index 00000000000..dbcf67faef3 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.reference @@ -0,0 +1,4 @@ +800000 2000000 1400000 900000 +800000 2000000 1400000 900000 +Tuple(col0 UInt64, col1 UInt64, col2 UInt64, col3 UInt64, col4 UInt64, col5 UInt64, col6 UInt64, col7 UInt64, col8 UInt64) +1600000 4000000 2800000 1800000 diff --git a/tests/queries/0_stateless/01825_type_json_from_map.sql b/tests/queries/0_stateless/01825_type_json_from_map.sql new file mode 100644 index 00000000000..2480aca1667 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.sql @@ -0,0 +1,41 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) ENGINE = MergeTree ORDER BY id; +CREATE TABLE t_map(id UInt64, m Map(String, UInt64)) ENGINE = MergeTree ORDER BY id; + +INSERT INTO t_map +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(String, UInt64) +FROM numbers(1000000); + +INSERT INTO t_json SELECT id, m FROM t_map; +SELECT sum(m['col1']), sum(m['col4']), sum(m['col7']), sum(m['col8'] = 0) FROM t_map; +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; +SELECT toTypeName(obj) FROM t_json LIMIT 1; + +INSERT INTO t_json +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(FixedString(4), UInt64) +FROM numbers(1000000); + +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; + +INSERT INTO t_json +SELECT number, (range(number % 10), range(number % 10))::Map(UInt64, UInt64) +FROM numbers(1000000); -- { serverError 53 } + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.reference b/tests/queries/0_stateless/01825_type_json_schema_inference.reference new file mode 100644 index 00000000000..c2c18b5a2ff --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.reference @@ -0,0 +1,8 @@ +{"id":"1","obj":{"k1":1,"k2":{"k3":"2","k4":[{"k5":3,"k6":0},{"k5":4,"k6":0}]},"some":0},"s":"foo"} +{"id":"2","obj":{"k1":0,"k2":{"k3":"str","k4":[{"k5":0,"k6":55}]},"some":42},"s":"bar"} +Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), 
some Int8) +{"id":"1","obj":"aaa","s":"foo"} +{"id":"2","obj":"bbb","s":"bar"} +{"map":{"k1":1,"k2":2},"obj":{"k1":1,"k2.k3":2},"map_type":"Map(String, Nullable(Float64))","obj_type":"Object('json')"} +{"obj":{"k1":1,"k2":2},"map":{"k1":"1","k2":"2"}} +Tuple(k1 Float64, k2 Float64) diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.sh b/tests/queries/0_stateless/01825_type_json_schema_inference.sh new file mode 100755 index 00000000000..4e8758761c5 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj JSON, s String) \ + ENGINE = MergeTree ORDER BY id" --allow_experimental_object_type 1 + +user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* + +filename="${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json" + +echo '{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}' > $filename +echo '{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj String, s String) ENGINE = MergeTree ORDER BY id" + +echo '{"obj": "aaa", "id": 1, "s": "foo"}' > $filename +echo '{"id": 2, "obj": "bbb", "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": {"k3": 2}}}' > $filename + +${CLICKHOUSE_CLIENT} -q "SELECT map, obj, toTypeName(map) AS map_type, toTypeName(obj) AS obj_type \ + FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow') FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (obj JSON, map Map(String, UInt64)) \ + ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": 2}}' > $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" From 
a6450be8b625b26f188a0f176217226fe648de86 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Sat, 26 Mar 2022 01:11:16 +0000 Subject: [PATCH 12/58] fix schema inference --- src/Formats/JSONEachRowUtils.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 80b9f84dafe..fb1ddb479f2 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -159,29 +159,37 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; - bool is_map = true; + bool is_object = false; for (const auto key_value_pair : object) { auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type || isObject(type)) + if (!type) + continue; + + if (isObject(type)) { - is_map = false; + is_object = true; break; } - if (value_type && value_type->getName() != type->getName()) + if (!value_type) { - is_map = false; + value_type = type; + } + else if (!value_type->equals(*type)) + { + is_object = true; break; } - - value_type = type; } - if (is_map && value_type) + if (is_object) + return std::make_shared("json", false); + + if (value_type) return std::make_shared(std::make_shared(), value_type); - return std::make_shared("json", false); + return nullptr; } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; From 58a78fba12da9e298611a2ecdd1558eff31bfe53 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 28 Mar 2022 12:58:22 +0800 Subject: [PATCH 13/58] formatString bug fix and refactoring --- src/Common/format.h | 176 +++++++++++++++++ src/Functions/concat.cpp | 24 ++- src/Functions/formatString.cpp | 24 ++- src/Functions/formatString.h | 180 +----------------- ...245_format_string_stack_overflow.reference | 1 + .../02245_format_string_stack_overflow.sql | 1 + 6 files changed, 209 insertions(+), 197 deletions(-) create mode 100644 src/Common/format.h create mode 100644 tests/queries/0_stateless/02245_format_string_stack_overflow.reference create mode 100644 tests/queries/0_stateless/02245_format_string_stack_overflow.sql diff --git a/src/Common/format.h b/src/Common/format.h new file mode 100644 index 00000000000..234ff90061c --- /dev/null +++ b/src/Common/format.h @@ -0,0 +1,176 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace Format +{ + using IndexPositions = PODArrayWithStackMemory; + + static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number) + { + res = 0; + for (UInt64 pos = l; pos < r; ++pos) + { + if (!isNumericASCII(description[pos])) + throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS); + res = res * 10 + description[pos] - '0'; + if (res >= argument_number) + throw Exception( + "Too big number for arguments, must be at most " + std::to_string(argument_number - 1), ErrorCodes::BAD_ARGUMENTS); + } + } + + static inline void init( + const String & pattern, + size_t argument_number, + const std::vector> & constant_strings, + IndexPositions & index_positions, + std::vector & substrings) + { + /// Is current position after open curly brace. + bool is_open_curly = false; + /// The position of last open token. + size_t last_open = -1; + + /// Is formatting in a plain {} token. 
+ std::optional is_plain_numbering; + UInt64 index_if_plain = 0; + + /// Left position of adding substrings, just to the closed brace position or the start of the string. + /// Invariant --- the start of substring is in this position. + size_t start_pos = 0; + + /// A flag to decide whether we should glue the constant strings. + bool glue_to_next = false; + + /// Handling double braces (escaping). + auto double_brace_removal = [](String & str) + { + size_t i = 0; + bool should_delete = true; + str.erase( + std::remove_if( + str.begin(), + str.end(), + [&i, &should_delete, &str](char) + { + bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); + ++i; + if (is_double_brace && should_delete) + { + should_delete = false; + return true; + } + should_delete = true; + return false; + }), + str.end()); + }; + + index_positions.emplace_back(); + + for (size_t i = 0; i < pattern.size(); ++i) + { + if (pattern[i] == '{') + { + /// Escaping handling + /// It is safe to access because of null termination + if (pattern[i + 1] == '{') + { + ++i; + continue; + } + + if (is_open_curly) + throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); + + String to_add = String(pattern.data() + start_pos, i - start_pos); + double_brace_removal(to_add); + if (!glue_to_next) + substrings.emplace_back(to_add); + else + substrings.back() += to_add; + + glue_to_next = false; + + is_open_curly = true; + last_open = i + 1; + } + else if (pattern[i] == '}') + { + if (pattern[i + 1] == '}') + { + ++i; + continue; + } + + if (!is_open_curly) + throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); + + is_open_curly = false; + + if (last_open == i) + { + if (is_plain_numbering && !*is_plain_numbering) + throw Exception( + "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); + is_plain_numbering = true; + if (index_if_plain >= argument_number) + throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS); + index_positions.back() = index_if_plain++; + } + else + { + if (is_plain_numbering && *is_plain_numbering) + throw Exception( + "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); + is_plain_numbering = false; + + UInt64 arg; + parseNumber(pattern, last_open, i, arg, argument_number); + + if (arg >= argument_number) + throw Exception( + "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS); + + index_positions.back() = arg; + } + + if (!constant_strings.empty() && constant_strings[index_positions.back()]) + { + /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. + glue_to_next = true; + substrings.back() += *constant_strings[index_positions.back()]; + } + else + index_positions.emplace_back(); /// Otherwise we commit arg number and proceed. 
+ + start_pos = i + 1; + } + } + + if (is_open_curly) + throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS); + + String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); + double_brace_removal(to_add); + + if (!glue_to_next) + substrings.emplace_back(to_add); + else + substrings.back() += to_add; + + index_positions.pop_back(); + } +} + +} diff --git a/src/Functions/concat.cpp b/src/Functions/concat.cpp index cd83223de3e..e11071265ce 100644 --- a/src/Functions/concat.cpp +++ b/src/Functions/concat.cpp @@ -52,23 +52,21 @@ public: { if (arguments.size() < 2) throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at least 2.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - if (arguments.size() > FormatImpl::argument_threshold) - throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at most " + std::to_string(FormatImpl::argument_threshold), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at least 2", + getName(), + arguments.size()); for (const auto arg_idx : collections::range(0, arguments.size())) { const auto * arg = arguments[arg_idx].get(); if (!isStringOrFixedString(arg)) - throw Exception{"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " - + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}", + arg->getName(), + arg_idx + 1, + getName()); } return std::make_shared(); @@ -125,7 +123,7 @@ private: std::vector data(num_arguments); std::vector offsets(num_arguments); std::vector fixed_string_sizes(num_arguments); - std::vector constant_strings(num_arguments); + std::vector> constant_strings(num_arguments); bool has_column_string = false; bool has_column_fixed_string = false; for (size_t i = 0; i < num_arguments; ++i) diff --git a/src/Functions/formatString.cpp b/src/Functions/formatString.cpp index e138c072639..ee90a082e5b 100644 --- a/src/Functions/formatString.cpp +++ b/src/Functions/formatString.cpp @@ -45,25 +45,23 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.empty()) + if (arguments.size() < 2) throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at least 1", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - if (arguments.size() > FormatImpl::argument_threshold) - throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at most " + std::to_string(FormatImpl::argument_threshold), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at least 2", + getName(), + arguments.size()); for (const auto arg_idx : collections::range(0, arguments.size())) { const auto * arg = arguments[arg_idx].get(); if (!isStringOrFixedString(arg)) throw Exception( - "Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + 
"Illegal type {} of argument {} of function {}", + arg->getName(), + arg_idx + 1, + getName()); } return std::make_shared(); @@ -84,7 +82,7 @@ public: std::vector data(arguments.size() - 1); std::vector offsets(arguments.size() - 1); std::vector fixed_string_sizes(arguments.size() - 1); - std::vector constant_strings(arguments.size() - 1); + std::vector> constant_strings(arguments.size() - 1); bool has_column_string = false; bool has_column_fixed_string = false; diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index 419ecf1c773..3c5057ecb97 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -4,8 +4,10 @@ #include #include #include +#include #include + #include #include #include @@ -22,8 +24,6 @@ namespace ErrorCodes struct FormatImpl { - static constexpr size_t small_argument_threshold = 1024; - static constexpr size_t argument_threshold = std::numeric_limits::max(); static constexpr size_t right_padding = 15; template @@ -39,165 +39,10 @@ struct FormatImpl format(std::forward(args)...); } - static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) - { - res = 0; - for (UInt64 pos = l; pos < r; ++pos) - { - if (!isNumericASCII(description[pos])) - throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS); - res = res * 10 + description[pos] - '0'; - if (res >= argument_threshold) - throw Exception( - "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::BAD_ARGUMENTS); - } - } - - static inline void init( - const String & pattern, - const std::vector & data, - size_t argument_number, - const std::vector & constant_strings, - UInt64 * index_positions_ptr, - std::vector & substrings) - { - /// Is current position after open curly brace. - bool is_open_curly = false; - /// The position of last open token. - size_t last_open = -1; - - /// Is formatting in a plain {} token. - std::optional is_plain_numbering; - UInt64 index_if_plain = 0; - - /// Left position of adding substrings, just to the closed brace position or the start of the string. - /// Invariant --- the start of substring is in this position. - size_t start_pos = 0; - - /// A flag to decide whether we should glue the constant strings. - bool glue_to_next = false; - - /// Handling double braces (escaping). 
- auto double_brace_removal = [](String & str) - { - size_t i = 0; - bool should_delete = true; - str.erase( - std::remove_if( - str.begin(), - str.end(), - [&i, &should_delete, &str](char) - { - bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); - ++i; - if (is_double_brace && should_delete) - { - should_delete = false; - return true; - } - should_delete = true; - return false; - }), - str.end()); - }; - - for (size_t i = 0; i < pattern.size(); ++i) - { - if (pattern[i] == '{') - { - /// Escaping handling - /// It is safe to access because of null termination - if (pattern[i + 1] == '{') - { - ++i; - continue; - } - - if (is_open_curly) - throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); - - String to_add = String(pattern.data() + start_pos, i - start_pos); - double_brace_removal(to_add); - if (!glue_to_next) - substrings.emplace_back(to_add); - else - substrings.back() += to_add; - - glue_to_next = false; - - is_open_curly = true; - last_open = i + 1; - } - else if (pattern[i] == '}') - { - if (pattern[i + 1] == '}') - { - ++i; - continue; - } - - if (!is_open_curly) - throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); - - is_open_curly = false; - - if (last_open == i) - { - if (is_plain_numbering && !*is_plain_numbering) - throw Exception( - "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); - is_plain_numbering = true; - if (index_if_plain >= argument_number) - throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS); - *index_positions_ptr = index_if_plain++; - } - else - { - if (is_plain_numbering && *is_plain_numbering) - throw Exception( - "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); - is_plain_numbering = false; - - UInt64 arg; - parseNumber(pattern, last_open, i, arg); - - if (arg >= argument_number) - throw Exception( - "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS); - - *index_positions_ptr = arg; - } - - /// Constant string. - if (!data[*index_positions_ptr]) - { - /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. - glue_to_next = true; - substrings.back() += constant_strings[*index_positions_ptr]; - } - else - ++index_positions_ptr; /// Otherwise we commit arg number and proceed. - - start_pos = i + 1; - } - } - - if (is_open_curly) - throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS); - - String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); - double_brace_removal(to_add); - - if (!glue_to_next) - substrings.emplace_back(to_add); - else - substrings.back() += to_add; - } - /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string. /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString. /// fixed_string_N for savings N to fixed strings. - /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string. + /// constant_strings for constant strings. If data[i] is nullptr, it is constant string. /// res_data is result_data, res_offsets is offset result. /// input_rows_count is the number of rows processed. /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). 
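The overflow addressed by the following hunk appears to come from sizing the index buffer by the number of arguments while writing one entry per placeholder: the old small_index_positions_buffer (1024 slots on the stack) is kept whenever argument_number <= 1024, yet a pattern such as the regression test's '{0}{0}...{0}' with a single argument produces far more writes than slots. Format::IndexPositions (a PODArrayWithStackMemory) instead grows per parsed placeholder. A standalone sketch of that idea, not ClickHouse code, with double-brace escaping and the automatic/manual numbering check omitted and all names hypothetical, could look like this:

    // Sketch only: collect one index per '{}' / '{N}' placeholder into a growable
    // container, so storage is bounded by the placeholder count, not by the
    // argument count (the old fixed-size stack buffer conflated the two).
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    std::vector<uint64_t> collectPlaceholderIndexes(const std::string & pattern, size_t argument_count)
    {
        std::vector<uint64_t> positions;  // grows with the pattern, never overflows a fixed slot count
        uint64_t next_auto_index = 0;
        for (size_t i = 0; i < pattern.size(); ++i)
        {
            if (pattern[i] != '{')
                continue;
            size_t close = pattern.find('}', i);
            if (close == std::string::npos)
                throw std::runtime_error("Last open curly brace is not closed");
            std::string body = pattern.substr(i + 1, close - i - 1);
            uint64_t index = body.empty() ? next_auto_index++ : std::stoull(body);
            if (index >= argument_count)
                throw std::runtime_error("Argument is too big for formatting");
            positions.push_back(index);
            i = close;
        }
        return positions;
    }

    int main()
    {
        std::string pattern;
        for (int i = 0; i < 2000; ++i)  // far more placeholders than the old 1024-slot stack array
            pattern += "{0}";
        std::cout << collectPlaceholderIndexes(pattern, 1).size() << " placeholders, 1 argument\n";
    }

With 2000 placeholders and one argument this simply prints the placeholder count, whereas a buffer sized by the argument count would have been overrun, which is the scenario covered by 02245_format_string_stack_overflow.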
@@ -207,29 +52,22 @@ struct FormatImpl const std::vector & data, const std::vector & offsets, [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector & fixed_string_N, - const std::vector & constant_strings, + const std::vector> & constant_strings, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, size_t input_rows_count) { const size_t argument_number = offsets.size(); - UInt64 small_index_positions_buffer[small_argument_threshold]; - /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices. - UInt64 * index_positions = small_index_positions_buffer; - - std::unique_ptr big_index_positions_buffer; - if (argument_number > small_argument_threshold) - { - big_index_positions_buffer.reset(new UInt64[argument_number]); - index_positions = big_index_positions_buffer.get(); - } + /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this + /// array will be filled with [1, 3, 1, 0] but without constant string indices. + Format::IndexPositions index_positions; /// Vector of substrings of pattern that will be copied to the answer, not string view because of escaping and iterators invalidation. /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``]. std::vector substrings; - init(pattern, data, argument_number, constant_strings, index_positions, substrings); + Format::init(pattern, argument_number, constant_strings, index_positions, substrings); UInt64 final_size = 0; @@ -271,7 +109,7 @@ struct FormatImpl for (size_t j = 1; j < substrings.size(); ++j) { UInt64 arg = index_positions[j - 1]; - auto offset_ptr = offsets[arg]; + const auto * offset_ptr = offsets[arg]; UInt64 arg_offset = 0; UInt64 size = 0; diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.reference b/tests/queries/0_stateless/02245_format_string_stack_overflow.reference new file mode 100644 index 00000000000..6e163a64914 --- /dev/null +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.reference @@ -0,0 +1 @@ 
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql new file mode 100644 index 00000000000..1ee3606d3a6 --- /dev/null +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql @@ -0,0 +1 @@ +select 
format('{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{
0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}', toString(number)) str from numbers(1); From d9d826c813cafced31842be0ac3397b1c6292f60 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 08:19:23 +0000 Subject: [PATCH 14/58] Address PR review --- src/Common/RangeGenerator.h | 2 +- src/IO/ReadBufferFromS3.cpp | 19 +++++++++++++------ src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 17 ++++------------- src/Storages/StorageS3.h | 2 ++ 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h index 8d3b88936fc..8f097087c8c 100644 --- a/src/Common/RangeGenerator.h +++ b/src/Common/RangeGenerator.h @@ -34,7 +34,7 @@ public: Range range{from, to}; from = to; - return std::move(range); + return range; } private: diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index c78f41a0f9d..6616d92b492 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -1,4 +1,5 @@ #include +#include "IO/S3Common.h" #if USE_AWS_S3 @@ -212,13 +213,14 @@ std::optional ReadBufferFromS3::getTotalSize() if (file_size) return 
file_size; - Aws::S3::Model::HeadObjectRequest request; - request.SetBucket(bucket); - request.SetKey(key); + auto object_size = S3::getObjectSize(client_ptr, bucket, key, false); - auto outcome = client_ptr->HeadObject(request); - auto head_result = outcome.GetResultWithOwnership(); - file_size = head_result.GetContentLength(); + if (!object_size) + { + return std::nullopt; + } + + file_size = object_size; return file_size; } @@ -236,6 +238,11 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position) } } +SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const +{ + return Range{.left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt}; +} + std::unique_ptr ReadBufferFromS3::initialize() { Aws::S3::Model::GetObjectRequest req; diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index dbba751b321..5c9d709d58e 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -63,7 +63,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override; size_t getFileOffsetOfBufferEnd() const override { return offset; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b40e352e1fb..411f78e44d6 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -307,7 +307,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k const bool object_too_small = object_size < download_thread_num * download_buffer_size; if (!use_parallel_download || object_too_small) { - LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + LOG_TRACE(log, "Downloading object of size {} from S3 in single thread", object_size); return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); } @@ -315,29 +315,20 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) { - LOG_WARNING( - &Poco::Logger::get("StorageS3Source"), - "Downloading buffer {} bytes too small, set at least {} bytes", - download_buffer_size, - DBMS_DEFAULT_BUFFER_SIZE); + LOG_WARNING(log, "Downloading buffer {} bytes too small, set at least {} bytes", download_buffer_size, DBMS_DEFAULT_BUFFER_SIZE); download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; } auto factory = std::make_unique( client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); LOG_TRACE( - &Poco::Logger::get("StorageS3Source"), - "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", - download_thread_num, - object_size, - download_buffer_size); + log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() ? CurrentThread::get().getThreadGroup() : MainThreadStatus::getInstance().getThreadGroup(); - ContextPtr query_context - = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + ContextPtr query_context = CurrentThread::isInitialized() ? 
CurrentThread::get().getQueryContext() : nullptr; auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) { diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6362e777df6..e67223190ba 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -107,6 +107,8 @@ private: std::shared_ptr file_iterator; size_t download_thread_num = 1; + Poco::Logger * log = &Poco::Logger::get("StorageS3Source"); + /// Recreate ReadBuffer and BlockInputStream for each file. bool initialize(); From 6003a49a877908ac1d9f4651b7d507e6055e9a9b Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 28 Mar 2022 17:48:47 +0800 Subject: [PATCH 15/58] Fix style --- src/Common/format.h | 2 ++ src/Functions/formatString.h | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Common/format.h b/src/Common/format.h index 234ff90061c..a9382f247ab 100644 --- a/src/Common/format.h +++ b/src/Common/format.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index 3c5057ecb97..44fbdac9378 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -17,10 +17,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} struct FormatImpl { From 065bc13aad41c6a564e5803d0408a91f640fec43 Mon Sep 17 00:00:00 2001 From: pdai Date: Mon, 28 Mar 2022 20:58:18 +0800 Subject: [PATCH 16/58] fix typo --- .../mergetree-family/versionedcollapsingmergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index b81d2206bf4..f5f2c428ea7 100644 --- a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -8,7 +8,7 @@ toc_title: "版本折叠MergeTree" 这个引擎: - 允许快速写入不断变化的对象状态。 -- 删除后台中的旧对象状态。 这显着降低了存储体积。 +- 删除后台中的旧对象状态。 这显著降低了存储体积。 请参阅部分 [崩溃](#table_engines_versionedcollapsingmergetree) 有关详细信息。 From 0e4af89f69e1340d27846f24faf14268ae0c4a10 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 17:21:47 +0000 Subject: [PATCH 17/58] fix reading from type object --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 5 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 39 ++++++------ .../MergeTree/MergeTreeBlockReadUtils.h | 2 +- .../MergeTree/MergeTreeSequentialSource.cpp | 2 +- src/Storages/StorageSnapshot.cpp | 62 ++++++++++--------- src/Storages/StorageSnapshot.h | 4 ++ .../01825_type_json_missed_values.reference | 2 + .../01825_type_json_missed_values.sql | 19 ++++++ 9 files changed, 82 insertions(+), 55 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.reference create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 9028023dc80..6e1d73da427 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -558,9 +558,10 @@ size_t IMergeTreeDataPart::getFileSizeOrZero(const String & file_name) const return checksum->second.file_size; } -String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const +String 
IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const { - const auto & storage_columns = metadata_snapshot->getColumns().getAllPhysical(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withSubcolumns(); + auto storage_columns = storage_snapshot->getColumns(options); MergeTreeData::AlterConversions alter_conversions; if (!parent_part) alter_conversions = storage.getAlterConversionsForPart(shared_from_this()); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 85088b10f70..0dfac49bf30 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -164,7 +164,7 @@ public: /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). /// If no checksums are present returns the name of the first physically existing column. - String getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const; + String getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const; bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 6e72b843f10..97516ea06c6 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -24,7 +24,7 @@ namespace /// least one existing (physical) column in part. bool injectRequiredColumnsRecursively( const String & column_name, - const ColumnsDescription & storage_columns, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::AlterConversions & alter_conversions, const MergeTreeData::DataPartPtr & part, Names & columns, @@ -36,7 +36,8 @@ bool injectRequiredColumnsRecursively( /// stages. checkStackSize(); - auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + auto column_in_storage = storage_snapshot->tryGetColumn(options, column_name); if (column_in_storage) { auto column_name_in_part = column_in_storage->getNameInStorage(); @@ -63,7 +64,8 @@ bool injectRequiredColumnsRecursively( /// Column doesn't have default value and don't exist in part /// don't need to add to required set. 
- const auto column_default = storage_columns.getDefault(column_name); + auto metadata_snapshot = storage_snapshot->getMetadataForQuery(); + const auto column_default = metadata_snapshot->getColumns().getDefault(column_name); if (!column_default) return false; @@ -73,39 +75,36 @@ bool injectRequiredColumnsRecursively( bool result = false; for (const auto & identifier : identifiers) - result |= injectRequiredColumnsRecursively(identifier, storage_columns, alter_conversions, part, columns, required_columns, injected_columns); + result |= injectRequiredColumnsRecursively(identifier, storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); return result; } } -NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & part, Names & columns) +NameSet injectRequiredColumns( + const MergeTreeData & storage, + const StorageSnapshotPtr & storage_snapshot, + const MergeTreeData::DataPartPtr & part, + Names & columns) { NameSet required_columns{std::begin(columns), std::end(columns)}; NameSet injected_columns; bool have_at_least_one_physical_column = false; - - const auto & storage_columns = metadata_snapshot->getColumns(); MergeTreeData::AlterConversions alter_conversions; if (!part->isProjectionPart()) alter_conversions = storage.getAlterConversionsForPart(part); + for (size_t i = 0; i < columns.size(); ++i) { - auto name_in_storage = Nested::extractTableName(columns[i]); - if (storage_columns.has(name_in_storage) && isObject(storage_columns.get(name_in_storage).type)) - { - have_at_least_one_physical_column = true; - continue; - } - /// We are going to fetch only physical columns - if (!storage_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, columns[i])) - throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + if (!storage_snapshot->tryGetColumn(options, columns[i])) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[i]); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( - columns[i], storage_columns, alter_conversions, + columns[i], storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); } @@ -115,7 +114,7 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada */ if (!have_at_least_one_physical_column) { - const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(metadata_snapshot); + const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(storage_snapshot); columns.push_back(minimum_size_column_name); /// correctly report added column injected_columns.insert(columns.back()); @@ -271,7 +270,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names pre_column_names; /// inject columns required for defaults evaluation - bool should_reorder = !injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, column_names).empty(); + bool should_reorder = !injectRequiredColumns(storage, storage_snapshot, data_part, column_names).empty(); if (prewhere_info) { @@ -296,7 +295,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); - const auto injected_pre_columns = injectRequiredColumns(storage, 
storage_snapshot->getMetadataForQuery(), data_part, pre_column_names); + const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot, data_part, pre_column_names); if (!injected_pre_columns.empty()) should_reorder = true; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 2373881f954..b4293b4ce3d 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -22,7 +22,7 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptrrows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, storage_snapshot->metadata, data_part, columns_to_read); + injectRequiredColumns(storage, storage_snapshot, data_part, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index e214afc6a90..8a82c5387c5 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -51,40 +51,42 @@ NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) NamesAndTypesList StorageSnapshot::getColumnsByNames(const GetColumnsOptions & options, const Names & names) const { NamesAndTypesList res; - const auto & columns = getMetadataForQuery()->getColumns(); for (const auto & name : names) + res.push_back(getColumn(options, name)); + return res; +} + +std::optional StorageSnapshot::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto column = columns.tryGetColumn(options, column_name); + if (column && (!isObject(column->type) || !options.with_extended_objects)) + return column; + + if (options.with_extended_objects) { - auto column = columns.tryGetColumn(options, name); - if (column && !isObject(column->type)) - { - res.emplace_back(std::move(*column)); - continue; - } - - if (options.with_extended_objects) - { - auto object_column = object_columns.tryGetColumn(options, name); - if (object_column) - { - res.emplace_back(std::move(*object_column)); - continue; - } - } - - if (options.with_virtuals) - { - auto it = virtual_columns.find(name); - if (it != virtual_columns.end()) - { - res.emplace_back(name, it->second); - continue; - } - } - - throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", name); + auto object_column = object_columns.tryGetColumn(options, column_name); + if (object_column) + return object_column; } - return res; + if (options.with_virtuals) + { + auto it = virtual_columns.find(column_name); + if (it != virtual_columns.end()) + return NameAndTypePair(column_name, it->second); + } + + return {}; +} + +NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, const String & column_name) const +{ + auto column = tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", column_name); + + return *column; } Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 46244827f6c..909f4fd5cab 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -61,6 +61,10 @@ struct StorageSnapshot /// Get columns with types according to options only for requested names. 
NamesAndTypesList getColumnsByNames(const GetColumnsOptions & options, const Names & names) const; + /// Get column with type according to options for requested name. + std::optional tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; + /// Block with ordinary + materialized + aliases + virtuals + subcolumns. Block getSampleBlockForColumns(const Names & column_names) const; diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.reference b/tests/queries/0_stateless/01825_type_json_missed_values.reference new file mode 100644 index 00000000000..b480493995b --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.reference @@ -0,0 +1,2 @@ +Tuple(foo Int8, k1 Int8, k2 Int8) +1 diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.sql b/tests/queries/0_stateless/01825_type_json_missed_values.sql new file mode 100644 index 00000000000..2420ab7cf34 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.sql @@ -0,0 +1,19 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) +ENGINE = MergeTree ORDER BY id +SETTINGS min_bytes_for_wide_part = 0; + +SYSTEM STOP MERGES t_json; + +INSERT INTO t_json SELECT number, '{"k1": 1, "k2": 2}' FROM numbers(1000000); +INSERT INTO t_json VALUES (1000001, '{"foo": 1}'); + +SELECT toTypeName(obj) FROM t_json LIMIT 1; +SELECT count() FROM t_json WHERE obj.foo != 0; + +DROP TABLE IF EXISTS t_json; From 24c0cf86d44682ea0d8fb967cb17837191381d82 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 19:25:51 +0000 Subject: [PATCH 18/58] add function 'flattenTuple' --- src/Functions/flattenTuple.cpp | 68 +++++++++++++++++++ .../registerFunctionsMiscellaneous.cpp | 2 + .../0_stateless/02246_flatten_tuple.reference | 4 ++ .../0_stateless/02246_flatten_tuple.sql | 22 ++++++ 4 files changed, 96 insertions(+) create mode 100644 src/Functions/flattenTuple.cpp create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.reference create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.sql diff --git a/src/Functions/flattenTuple.cpp b/src/Functions/flattenTuple.cpp new file mode 100644 index 00000000000..f5d5b4cb773 --- /dev/null +++ b/src/Functions/flattenTuple.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + +class FunctionFlattenTuple : public IFunction +{ +public: + static constexpr auto name = "flattenTuple"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto & type = arguments[0]; + const auto * type_tuple = checkAndGetDataType(type.get()); + if (!type_tuple || !type_tuple->haveExplicitNames()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function '{}' must be Named Tuple. 
Got '{}'", + getName(), type->getName()); + + auto [paths, types] = flattenTuple(type); + Names names; + names.reserve(paths.size()); + for (const auto & path : paths) + names.push_back(path.getPath()); + + return std::make_shared(types, names); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + auto column = arguments.at(0).column; + if (!checkAndGetColumn(column.get())) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}. Expected ColumnTuple", + column->getName(), getName()); + + return flattenTuple(column); + } +}; + +} + +void registerFunctionFlattenTuple(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 76d61ce509a..40a8409cd15 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -80,6 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory & factory); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); +void registerFunctionFlattenTuple(FunctionFactory &); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -166,6 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionServerUUID(factory); registerFunctionZooKeeperSessionUptime(factory); registerFunctionGetOSKernelVersion(factory); + registerFunctionFlattenTuple(factory); #if USE_ICU registerFunctionConvertCharset(factory); diff --git a/tests/queries/0_stateless/02246_flatten_tuple.reference b/tests/queries/0_stateless/02246_flatten_tuple.reference new file mode 100644 index 00000000000..0320150025d --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.reference @@ -0,0 +1,4 @@ +([1,2],['a','b'],3,'c',4) Tuple(`t1.a` Array(UInt32), `t1.s` Array(String), b UInt32, `t2.k` String, `t2.v` UInt32) +Tuple(id Int8, obj Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), some Int8), s String) Tuple(id Int8, `obj.k1` Int8, `obj.k2.k3` String, `obj.k2.k4.k5` Array(Int8), `obj.k2.k4.k6` Array(Int8), `obj.some` Int8, s String) +1 1 2 [3,4] [0,0] 0 foo +2 0 str [0] [55] 42 bar diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql new file mode 100644 index 00000000000..e73f382b261 --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; + +SET flatten_nested = 0; + +CREATE TABLE t_flatten_tuple(t Tuple(t1 Nested(a UInt32, s String), b UInt32, t2 Tuple(k String, v UInt32))) ENGINE = Memory; + +INSERT INTO t_flatten_tuple VALUES (([(1, 'a'), (2, 'b')], 3, ('c', 4))); + +SELECT flattenTuple(t) AS ft, toTypeName(ft) FROM t_flatten_tuple; + +SET allow_experimental_object_type = 1; +CREATE TABLE t_flatten_object(data JSON) ENGINE = Memory; + +INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}'); +INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); + +SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object; + +DROP 
TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; From 36ea9ef11fce7a8d8672d376606ccbe6f6547831 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 28 Mar 2022 19:52:10 +0000 Subject: [PATCH 19/58] Fix creating materialized view with subquery after server restart --- src/Databases/DatabaseOnDisk.cpp | 4 ++++ .../__init__.py | 0 .../test.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100755 tests/integration/test_materialized_view_restart_server/__init__.py create mode 100755 tests/integration/test_materialized_view_restart_server/test.py diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 9f56b5f7676..f12009cef1a 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,9 @@ std::pair createTableFromAST( ast_create_query.attach = true; ast_create_query.setDatabase(database_name); + if (ast_create_query.select && ast_create_query.isView()) + ApplyWithSubqueryVisitor().visit(*ast_create_query.select); + if (ast_create_query.as_table_function) { const auto & factory = TableFunctionFactory::instance(); diff --git a/tests/integration/test_materialized_view_restart_server/__init__.py b/tests/integration/test_materialized_view_restart_server/__init__.py new file mode 100755 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py new file mode 100755 index 00000000000..828d8c279bf --- /dev/null +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -0,0 +1,23 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node", stay_alive=True) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_materialized_view_with_subquery(start_cluster): + node.query("create table test (x UInt32) engine=TineLog()") + node.query("create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery") + node.restart_clickhouse(kill=True) + node.query("insert into test select 1") + result = node.query("select * from mv") + assert int(result) == 1 From c066593595bd70a16cc0c84bd88778da1d801167 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 28 Mar 2022 22:02:28 +0200 Subject: [PATCH 20/58] Update test.py --- .../integration/test_materialized_view_restart_server/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py index 828d8c279bf..4ede9f0f062 100755 --- a/tests/integration/test_materialized_view_restart_server/test.py +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -16,7 +16,9 @@ def start_cluster(): def test_materialized_view_with_subquery(start_cluster): node.query("create table test (x UInt32) engine=TineLog()") - node.query("create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery") + node.query( + "create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery" + ) node.restart_clickhouse(kill=True) node.query("insert into test select 1") result = 
node.query("select * from mv") From cd3cc60d5dbdf05bcf585bac7b2bd4c71d1177ed Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 21:13:20 +0000 Subject: [PATCH 21/58] fix fasttest --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index e73f382b261..927e3b9ec80 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS t_flatten_tuple; DROP TABLE IF EXISTS t_flatten_object; From 98fe1ca1ecefed61b3ea4b4416aef4988546ab2c Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 29 Mar 2022 12:02:21 +0200 Subject: [PATCH 22/58] Fix --- programs/local/LocalServer.cpp | 9 +++---- ...6_clickhouse_local_drop_database.reference | 0 .../02246_clickhouse_local_drop_database.sh | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference create mode 100755 tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 26d42a11315..bb6684ca137 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -184,6 +184,11 @@ void LocalServer::tryInitPath() if (path.back() != '/') path += '/'; + fs::create_directories(fs::path(path) / "user_defined/"); + fs::create_directories(fs::path(path) / "data/"); + fs::create_directories(fs::path(path) / "metadata/"); + fs::create_directories(fs::path(path) / "metadata_dropped/"); + global_context->setPath(path); global_context->setTemporaryStorage(path + "tmp"); @@ -565,7 +570,6 @@ void LocalServer::processConfig() /// Lock path directory before read status.emplace(fs::path(path) / "status", StatusFile::write_full_info); - fs::create_directories(fs::path(path) / "user_defined/"); LOG_DEBUG(log, "Loading user defined objects from {}", path); Poco::File(path + "user_defined/").createDirectories(); UserDefinedSQLObjectsLoader::instance().loadObjects(global_context); @@ -573,9 +577,6 @@ void LocalServer::processConfig() LOG_DEBUG(log, "Loaded user defined objects."); LOG_DEBUG(log, "Loading metadata from {}", path); - fs::create_directories(fs::path(path) / "data/"); - fs::create_directories(fs::path(path) / "metadata/"); - loadMetadataSystem(global_context); attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE)); attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh new file mode 100755 index 00000000000..00f3904192f --- /dev/null +++ b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +dir=${CLICKHOUSE_TEST_UNIQUE_NAME} +[[ -d $dir ]] && rm -r $dir +mkdir $dir +$CLICKHOUSE_LOCAL --multiline --multiquery --path $dir -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" + +$CLICKHOUSE_LOCAL --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" From f0b4a9fd15642240619dddfd4ebd5419b8b54e79 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:56:49 +0000 Subject: [PATCH 23/58] Fix construction of ReadBufferFromS3 --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 8f91804bbbe..0d50b24f7a5 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -44,7 +44,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S { return std::make_unique( client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, - settings, /* use_external_buffer */true, read_until_position, /* restricted_seek */true); + settings, /* use_external_buffer */true, /* offset */ 0, read_until_position, /* restricted_seek */true); }; if (with_cache) From a2aab8ffacbfa74b9d86bdb938eb04b356152f89 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:07:19 +0200 Subject: [PATCH 24/58] Fix test --- tests/integration/test_materialized_view_restart_server/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py index 4ede9f0f062..bfadec5136b 100755 --- a/tests/integration/test_materialized_view_restart_server/test.py +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -15,7 +15,7 @@ def start_cluster(): def test_materialized_view_with_subquery(start_cluster): - node.query("create table test (x UInt32) engine=TineLog()") + node.query("create table test (x UInt32) engine=TinyLog()") node.query( "create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery" ) From c59926dcc7e3bb52be7b9b86ead120ae0d8b640d Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 12:13:02 +0000 Subject: [PATCH 25/58] fix tests --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index 927e3b9ec80..139f7a621ef 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -18,7 +18,7 @@ INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; -SELECT untuple(flattenTuple(data)) FROM t_flatten_object; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object ORDER BY data.id; DROP TABLE IF EXISTS t_flatten_tuple; DROP 
TABLE IF EXISTS t_flatten_object; From ebd72e433d83dba85b3563af95b6aef015f8a4b8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 14:41:47 +0200 Subject: [PATCH 26/58] Fail checks in case of errors --- tests/ci/build_report_check.py | 3 +++ tests/ci/compatibility_check.py | 3 +++ tests/ci/docker_images_check.py | 3 +++ tests/ci/docs_check.py | 3 +++ tests/ci/fast_test_check.py | 2 +- tests/ci/integration_test_check.py | 3 +++ tests/ci/performance_comparison_check.py | 3 +++ tests/ci/split_build_smoke_check.py | 3 +++ tests/ci/stress_check.py | 3 +++ tests/ci/style_check.py | 3 +++ tests/ci/unit_tests_check.py | 3 +++ 11 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 592e905bcb5..50315ad3d78 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -244,3 +244,6 @@ if __name__ == "__main__": state=summary_status, target_url=url, ) + + if summary_status == "error": + sys.exit(1) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index d546fabf231..1f408ba0445 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -198,3 +198,6 @@ if __name__ == "__main__": CHECK_NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 3d0cc468aec..ef6d369bf4f 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -7,6 +7,7 @@ import platform import shutil import subprocess import time +import sys from typing import Dict, List, Optional, Set, Tuple, Union from github import Github @@ -461,6 +462,8 @@ def main(): ch_helper = ClickHouseHelper() ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + if status == "error": + sys.exit(1) if __name__ == "__main__": main() diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 58678b160a4..6e0c7e5d5fb 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -115,3 +115,6 @@ if __name__ == "__main__": NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if status == "error": + sys.exit(1) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 64e04594786..40fa5bff091 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -208,7 +208,7 @@ if __name__ == "__main__": # Refuse other checks to run if fast test failed if state != "success": - if "force-tests" in pr_info.labels: + if "force-tests" in pr_info.labels and state != "error": print("'force-tests' enabled, will report success") else: sys.exit(1) diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 30009414d6e..7481beb2329 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -280,3 +280,6 @@ if __name__ == "__main__": check_name_with_group, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index c6ce86b2ce1..2e8a0de7100 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -217,3 +217,6 @@ if __name__ == "__main__": post_commit_status( gh, pr_info.sha, check_name_with_group, message, status, report_url ) + + if status == 
"error": + sys.exit(1) diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 41ba6c2fedb..6629c2b3f71 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -148,3 +148,6 @@ if __name__ == "__main__": CHECK_NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 32c181140e2..904ac38bffe 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -177,3 +177,6 @@ if __name__ == "__main__": check_name, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 1b3037217c8..811e6d1d746 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -118,3 +118,6 @@ if __name__ == "__main__": NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 84c4faa822d..f86560562b9 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -174,3 +174,6 @@ if __name__ == "__main__": check_name, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) From 3246261da8a3152bc0ecf0c8855120454d76877f Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 2022 15:43:42 +0300 Subject: [PATCH 27/58] Fix logging in `test_distributed_respect_user_timeouts` (#35575) * Update test.py * Update test.py * Update test.py Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .../integration/test_distributed_respect_user_timeouts/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_distributed_respect_user_timeouts/test.py b/tests/integration/test_distributed_respect_user_timeouts/test.py index 9cf7082d63a..567377aba0b 100644 --- a/tests/integration/test_distributed_respect_user_timeouts/test.py +++ b/tests/integration/test_distributed_respect_user_timeouts/test.py @@ -94,7 +94,7 @@ def _check_exception(exception, expected_tries=3): @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def started_cluster(request): - cluster = ClickHouseCluster(__file__) + cluster = ClickHouseCluster(__file__, request.param) cluster.__with_ssl_config = request.param == "configs_secure" main_configs = [] main_configs += [os.path.join(request.param, "config.d/remote_servers.xml")] From 0cc9c8212410e8bad45e5124f7beb6e98e17c8d9 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 29 Mar 2022 13:09:37 +0000 Subject: [PATCH 28/58] Try to fix test_global_overcommit_tracker flakyness --- tests/integration/test_global_overcommit_tracker/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_global_overcommit_tracker/test.py b/tests/integration/test_global_overcommit_tracker/test.py index c2f3a22915f..cacc447be1a 100644 --- a/tests/integration/test_global_overcommit_tracker/test.py +++ b/tests/integration/test_global_overcommit_tracker/test.py @@ -30,7 +30,7 @@ def test_overcommited_is_killed(): responses_A = list() responses_B = list() - for _ in range(100): + for _ in range(500): responses_A.append(node.get_query_request(TEST_QUERY_A, user="A")) responses_B.append(node.get_query_request(TEST_QUERY_B, 
user="B")) From 9990abb76afad4ac7c65895976d777ded0ca9d1c Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 09:48:17 +0000 Subject: [PATCH 29/58] Use compile-time check for Exception messages, fix wrong messages --- src/Common/Exception.h | 22 +++++++++---------- src/Databases/DatabaseReplicatedWorker.cpp | 8 +++++-- src/Dictionaries/XDBCDictionarySource.cpp | 2 +- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 9 ++++++-- src/Formats/FormatSchemaInfo.cpp | 9 +++++--- src/Functions/FunctionsStringArray.h | 2 +- src/Functions/IFunction.cpp | 9 +++++--- src/Functions/castOrDefault.cpp | 2 +- src/Functions/dateName.cpp | 2 +- src/Interpreters/CatBoostModel.cpp | 1 + src/Interpreters/DatabaseCatalog.cpp | 8 +++---- src/Interpreters/executeDDLQueryOnCluster.cpp | 19 ++++++++++------ .../Formats/Impl/MsgPackRowInputFormat.cpp | 4 ++-- 13 files changed, 59 insertions(+), 38 deletions(-) diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 0bf89e7a447..c9bb8cc9f12 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -35,10 +35,10 @@ public: {} // Format message with fmt::format, like the logging functions. - template - Exception(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + Exception(int code, fmt::format_string fmt, Args &&... args) : Exception(fmt::format(fmt, std::forward(args)...), code) + { + } struct CreateFromPocoTag {}; struct CreateFromSTDTag {}; @@ -52,10 +52,10 @@ public: const char * what() const throw() override { return message().data(); } /// Add something to the existing message. - template - void addMessage(const std::string& format, Args&&... args) + template + void addMessage(fmt::format_string format, Args &&... args) { - extendedMessage(fmt::format(fmt::runtime(format), std::forward(args)...)); + extendedMessage(fmt::format(format, std::forward(args)...)); } void addMessage(const std::string& message) @@ -117,10 +117,10 @@ public: ParsingException(int code, const std::string & message); // Format message with fmt::format, like the logging functions. - template - ParsingException(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + ParsingException(int code, fmt::format_string fmt, Args &&... args) : Exception(code, fmt, std::forward(args)...) 
+ { + } std::string displayText() const diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 365a5d02816..b45cfb16362 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -179,8 +179,12 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr if (!task->was_executed) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}", - task->execution_status.code, task->execution_status.message); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Entry {} was executed, but was not committed: code {}: {}", + task->entry_name, + task->execution_status.code, + task->execution_status.message); } try_node->setAlreadyRemoved(); diff --git a/src/Dictionaries/XDBCDictionarySource.cpp b/src/Dictionaries/XDBCDictionarySource.cpp index f08abcdc516..764a7072ca0 100644 --- a/src/Dictionaries/XDBCDictionarySource.cpp +++ b/src/Dictionaries/XDBCDictionarySource.cpp @@ -50,7 +50,7 @@ namespace { if (!qualified_name.database.empty()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Dictionary source of type {} specifies a schema but schema is not supported by {}-driver", + "Dictionary source specifies a schema but schema is not supported by {}-driver", bridge_.getName()); } diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index 5cab2cb2995..4b614cd10e0 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -392,8 +392,13 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment) if (bytes_to_predownload) throw Exception( ErrorCodes::LOGICAL_ERROR, - "Failed to predownload remaining {} bytes. Current file segment: {}, current download offset: {}, expected: {}, eof: {}", - file_segment->range().toString(), file_segment->getDownloadOffset(), file_offset_of_buffer_end, implementation_buffer->eof()); + "Failed to predownload remaining {} bytes. 
Current file segment: {}, current download offset: {}, expected: {}, " + "eof: {}", + bytes_to_predownload, + file_segment->range().toString(), + file_segment->getDownloadOffset(), + file_offset_of_buffer_end, + implementation_buffer->eof()); auto result = implementation_buffer->hasPendingData(); diff --git a/src/Formats/FormatSchemaInfo.cpp b/src/Formats/FormatSchemaInfo.cpp index 24c8dfc14f2..c1157be6e7a 100644 --- a/src/Formats/FormatSchemaInfo.cpp +++ b/src/Formats/FormatSchemaInfo.cpp @@ -85,9 +85,12 @@ FormatSchemaInfo::FormatSchemaInfo(const String & format_schema, const String & else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) { if (is_server) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", - path.string()); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory(), + path.string(), + default_schema_directory()); path = default_schema_directory_path / path; schema_path = path.filename(); schema_directory = path.parent_path() / ""; diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index a1256598f1b..6b3adf46ff5 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -259,7 +259,7 @@ public: throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.", - arguments.size()); + name, arguments.size()); if (!isString(arguments[0])) throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index cfb4e12a025..19638c78daf 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -181,9 +181,12 @@ ColumnPtr IExecutableFunction::defaultImplementationForNulls( // Default implementation for nulls returns null result for null arguments, // so the result type must be nullable. if (!result_type->isNullable()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Function {} with Null argument and default implementation for Nulls " - "is expected to return Nullable result, got {}", result_type->getName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Function {} with Null argument and default implementation for Nulls " + "is expected to return Nullable result, got {}", + getName(), + result_type->getName()); return result_type->createColumnConstWithDefaultValue(input_rows_count); } diff --git a/src/Functions/castOrDefault.cpp b/src/Functions/castOrDefault.cpp index 628ac57f34d..f7b93ec2e83 100644 --- a/src/Functions/castOrDefault.cpp +++ b/src/Functions/castOrDefault.cpp @@ -231,7 +231,7 @@ private: { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} decimal scale should have native UInt type. 
Actual {}", - scale_argument.type->getName()); + getName(), scale_argument.type->getName()); } scale = arguments[additional_argument_index].column->getUInt(0); diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index eef9bc3955b..91ea8995777 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -112,7 +112,7 @@ public: || (res = executeType(arguments, result_type)))) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of function {], must be Date or DateTime.", + "Illegal column {} of function {}, must be Date or DateTime.", arguments[1].column->getName(), getName()); diff --git a/src/Interpreters/CatBoostModel.cpp b/src/Interpreters/CatBoostModel.cpp index cffaa81c4f0..d5803ed9e36 100644 --- a/src/Interpreters/CatBoostModel.cpp +++ b/src/Interpreters/CatBoostModel.cpp @@ -169,6 +169,7 @@ public: if (columns.size() != float_features_count + cat_features_count) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}", + columns.size(), float_features_count, cat_features_count); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 360a5d430e0..2f51d942403 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -233,7 +233,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( { assert(!db_and_table.first && !db_and_table.second); if (exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); return {}; } @@ -263,7 +263,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( /// If table_id has no UUID, then the name of database was specified by user and table_id was not resolved through context. /// Do not allow access to TEMPORARY_DATABASE because it contains all temporary tables of all contexts and users. 
if (exception) - exception->emplace(ErrorCodes::DATABASE_ACCESS_DENIED, "Direct access to `{}` database is not allowed", String(TEMPORARY_DATABASE)); + exception->emplace(fmt::format("Direct access to `{}` database is not allowed", TEMPORARY_DATABASE), ErrorCodes::DATABASE_ACCESS_DENIED); return {}; } @@ -274,7 +274,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( if (databases.end() == it) { if (exception) - exception->emplace(ErrorCodes::UNKNOWN_DATABASE, "Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())); + exception->emplace(fmt::format("Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())), ErrorCodes::UNKNOWN_DATABASE); return {}; } database = it->second; @@ -282,7 +282,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( auto table = database->tryGetTable(table_id.table_name, context_); if (!table && exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); if (!table) database = nullptr; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index ce00676b2ed..f0279bafca2 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -320,12 +320,13 @@ Chunk DDLQueryStatusSource::generate() if (throw_on_timeout) { if (!first_exception) - first_exception = std::make_unique(ErrorCodes::TIMEOUT_EXCEEDED, msg_format, - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + first_exception = std::make_unique( + fmt::format(msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts), + ErrorCodes::TIMEOUT_EXCEEDED); return {}; } - LOG_INFO(log, fmt::runtime(msg_format), node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); NameSet unfinished_hosts = waiting_hosts; for (const auto & host_id : finished_hosts) @@ -358,9 +359,12 @@ Chunk DDLQueryStatusSource::generate() /// Paradoxically, this exception will be throw even in case of "never_throw" mode. if (!first_exception) - first_exception = std::make_unique(ErrorCodes::UNFINISHED, - "Cannot provide query execution status. The query's node {} has been deleted by the cleaner" - " since it was finished (or its lifetime is expired)", node_path); + first_exception = std::make_unique( + fmt::format( + "Cannot provide query execution status. 
The query's node {} has been deleted by the cleaner" + " since it was finished (or its lifetime is expired)", + node_path), + ErrorCodes::UNFINISHED); return {}; } @@ -386,7 +390,8 @@ Chunk DDLQueryStatusSource::generate() if (status.code != 0 && !first_exception && context->getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW) { - first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); + first_exception = std::make_unique( + fmt::format("There was an error on [{}:{}]: {}", host, port, status.message), status.code); } ++num_hosts_finished; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 607e6f36767..722cedbab30 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -359,7 +359,7 @@ bool MsgPackVisitor::visit_ext(const char * value, uint32_t size) return true; } - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {%x}", type); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {:x}", type); } void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT @@ -498,7 +498,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) msgpack::object_ext object_ext = object.via.ext; if (object_ext.type() == int8_t(MsgPackExtensionTypes::UUIDType)) return std::make_shared(); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {%x} is not supported", object_ext.type()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type()); } } __builtin_unreachable(); From ef484547bf9927c222c588113487a1534e61e39a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:14:38 +0000 Subject: [PATCH 30/58] Fix GCC build --- src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp index a72d7b12a4f..ebfeaed8346 100644 --- a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp +++ b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp @@ -1,10 +1,13 @@ +// Needs to go first because its partial specialization of fmt::formatter +// should be defined before any instantiation +#include + #include #include #include #include -#include #include namespace DB From c9feb7f7515cf2711877fa40844a1b55b4e03267 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 13:40:14 +0000 Subject: [PATCH 31/58] Format test file --- tests/integration/test_allowed_url_from_config/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From d0f01516db0c2d403f9d45898251bd417e78ad54 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:48:57 +0200 Subject: [PATCH 32/58] Resurrect automatic 
labelling --- .github/workflows/master.yml | 10 - tests/ci/cherry_pick_utils/parser.py | 8 - tests/ci/commit_status_helper.py | 5 + tests/ci/run_check.py | 34 +- utils/github/__init__.py | 1 - utils/github/backport.py | 185 ---------- utils/github/cherrypick.py | 323 ------------------ utils/github/local.py | 108 ------ utils/github/parser.py | 64 ---- utils/github/query.py | 492 --------------------------- 10 files changed, 31 insertions(+), 1199 deletions(-) delete mode 100644 utils/github/__init__.py delete mode 100644 utils/github/backport.py delete mode 100644 utils/github/cherrypick.py delete mode 100644 utils/github/local.py delete mode 100644 utils/github/parser.py delete mode 100644 utils/github/query.py diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 73cffc1d5d3..cfa95b84ee5 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -149,7 +149,6 @@ jobs: sudo rm -fr "$TEMP_PATH" SplitBuildSmokeTest: needs: [BuilderDebSplitted] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, style-checker] steps: - name: Set envs @@ -316,7 +315,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinRelease: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -362,7 +360,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinGCC: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -636,7 +633,6 @@ jobs: ########################################################################################## BuilderDebSplitted: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -682,7 +678,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinTidy: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -728,7 +723,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinDarwin: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -774,7 +768,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -820,7 +813,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinFreeBSD: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -866,7 +858,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
BuilderBinDarwinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -912,7 +903,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinPPC64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs diff --git a/tests/ci/cherry_pick_utils/parser.py b/tests/ci/cherry_pick_utils/parser.py index d8348e6d964..29c05e5328f 100644 --- a/tests/ci/cherry_pick_utils/parser.py +++ b/tests/ci/cherry_pick_utils/parser.py @@ -20,8 +20,6 @@ class Description: def __init__(self, pull_request): self.label_name = str() - self.legal = False - self._parse(pull_request["bodyText"]) def _parse(self, text): @@ -39,12 +37,6 @@ class Description: category = stripped next_category = False - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - category_headers = ( "Category (leave one):", "Changelog category (leave one):", diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index e379c9a2254..53dfb145b44 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -59,3 +59,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): with open(file_path, "w", encoding="utf-8") as f: out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + +def post_label(gh, pr_info, label_name): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + pull_request.add_to_labels(label_name) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9c7ba13f8e4..74e60582e8d 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit +from commit_status_helper import get_commit, post_label NAME = "Run Check (actions)" @@ -81,6 +81,21 @@ TRUSTED_CONTRIBUTORS = { ] } +MAP_CATEGORY_TO_LABEL = { + "New Feature": "pr-feature", + "Bug Fix": "pr-bugfix", + "Bug Fix (user-visible misbehaviour in official stable or prestable release)": "pr-bugfix", + "Improvement": "pr-improvement", + "Performance Improvement": "pr-performance", + "Backward Incompatible Change": "pr-backward-incompatible", + "Build/Testing/Packaging Improvement": "pr-build", + "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", + "Not for changelog": "pr-not-for-changelog", + "Documentation (changelog entry is not required)": "pr-documentation", + "Documentation": "pr-documentation", + # 'Other': doesn't match anything +} + def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: @@ -168,7 +183,7 @@ def check_pr_description(pr_info): + second_category + "'" ) - return result_status[:140] + return result_status[:140], category elif re.match( r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] @@ -190,19 +205,19 @@ def check_pr_description(pr_info): i += 1 if not category: - return "Changelog category is empty" + return "Changelog category is empty", category # 
Filter out the PR categories that are not for changelog. if re.match( r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", category, ): - return "" + return "", category if not entry: - return f"Changelog entry required for category '{category}'" + return f"Changelog entry required for category '{category}'", category - return "" + return "", category if __name__ == "__main__": @@ -213,7 +228,10 @@ if __name__ == "__main__": gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - description_report = check_pr_description(pr_info)[:139] + description_report, category = check_pr_description(pr_info) + if category in MAP_CATEGORY_TO_LABEL: + post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( @@ -225,7 +243,7 @@ if __name__ == "__main__": ) commit.create_status( context=NAME, - description=description_report, + description=description_report[:139], state="failure", target_url=url, ) diff --git a/utils/github/__init__.py b/utils/github/__init__.py deleted file mode 100644 index 40a96afc6ff..00000000000 --- a/utils/github/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/utils/github/backport.py b/utils/github/backport.py deleted file mode 100644 index 615c0d19ffa..00000000000 --- a/utils/github/backport.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- - -try: - from clickhouse.utils.github.cherrypick import CherryPick - from clickhouse.utils.github.query import Query as RemoteRepo - from clickhouse.utils.github.local import Repository as LocalRepo -except: - from .cherrypick import CherryPick - from .query import Query as RemoteRepo - from .local import Repository as LocalRepo - -import argparse -import logging -import re -import sys - - -class Backport: - def __init__(self, token, owner, name, team): - self._gh = RemoteRepo( - token, owner=owner, name=name, team=team, max_page_size=30, min_page_size=7 - ) - self._token = token - self.default_branch_name = self._gh.default_branch - self.ssh_url = self._gh.ssh_url - - def getPullRequests(self, from_commit): - return self._gh.get_pull_requests(from_commit) - - def getBranchesWithRelease(self): - branches = set() - for pull_request in self._gh.find_pull_requests("release"): - branches.add(pull_request["headRefName"]) - return branches - - def execute(self, repo, upstream, until_commit, run_cherrypick): - repo = LocalRepo(repo, upstream, self.default_branch_name) - all_branches = repo.get_release_branches() # [(branch_name, base_commit)] - - release_branches = self.getBranchesWithRelease() - - branches = [] - # iterate over all branches to preserve their precedence. - for branch in all_branches: - if branch[0] in release_branches: - branches.append(branch) - - if not branches: - logging.info("No release branches found!") - return - - for branch in branches: - logging.info("Found release branch: %s", branch[0]) - - if not until_commit: - until_commit = branches[0][1] - pull_requests = self.getPullRequests(until_commit) - - backport_map = {} - - RE_MUST_BACKPORT = re.compile(r"^v(\d+\.\d+)-must-backport$") - RE_NO_BACKPORT = re.compile(r"^v(\d+\.\d+)-no-backport$") - RE_BACKPORTED = re.compile(r"^v(\d+\.\d+)-backported$") - - # pull-requests are sorted by ancestry from the most recent. - for pr in pull_requests: - while repo.comparator(branches[-1][1]) >= repo.comparator( - pr["mergeCommit"]["oid"] - ): - logging.info( - "PR #{} is already inside {}. 
Dropping this branch for further PRs".format( - pr["number"], branches[-1][0] - ) - ) - branches.pop() - - logging.info("Processing PR #{}".format(pr["number"])) - - assert len(branches) - - branch_set = set([branch[0] for branch in branches]) - - # First pass. Find all must-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-must-backport": - backport_map[pr["number"]] = branch_set.copy() - continue - matched = RE_MUST_BACKPORT.match(label["name"]) - if matched: - if pr["number"] not in backport_map: - backport_map[pr["number"]] = set() - backport_map[pr["number"]].add(matched.group(1)) - - # Second pass. Find all no-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-no-backport" and pr["number"] in backport_map: - del backport_map[pr["number"]] - break - matched_no_backport = RE_NO_BACKPORT.match(label["name"]) - matched_backported = RE_BACKPORTED.match(label["name"]) - if ( - matched_no_backport - and pr["number"] in backport_map - and matched_no_backport.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_no_backport.group(1)) - logging.info( - "\tskipping %s because of forced no-backport", - matched_no_backport.group(1), - ) - elif ( - matched_backported - and pr["number"] in backport_map - and matched_backported.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_backported.group(1)) - logging.info( - "\tskipping %s because it's already backported manually", - matched_backported.group(1), - ) - - for pr, branches in list(backport_map.items()): - logging.info("PR #%s needs to be backported to:", pr) - for branch in branches: - logging.info( - "\t%s, and the status is: %s", - branch, - run_cherrypick(self._token, pr, branch), - ) - - # print API costs - logging.info("\nGitHub API total costs per query:") - for name, value in list(self._gh.api_costs.items()): - logging.info("%s : %s", name, value) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", type=str, required=True, help="token for Github access" - ) - parser.add_argument( - "--repo", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - parser.add_argument( - "--til", type=str, help="check PRs from HEAD til this commit", metavar="COMMIT" - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="do not create or merge any PRs", - default=False, - ) - parser.add_argument( - "--verbose", - "-v", - action="store_true", - help="more verbose output", - default=False, - ) - parser.add_argument( - "--upstream", - "-u", - type=str, - help="remote name of upstream in repository", - default="origin", - ) - args = parser.parse_args() - - if args.verbose: - logging.basicConfig( - format="%(message)s", stream=sys.stdout, level=logging.DEBUG - ) - else: - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO) - - cherrypick_run = lambda token, pr, branch: CherryPick( - token, "ClickHouse", "ClickHouse", "core", pr, branch - ).execute(args.repo, args.dry_run) - bp = Backport(args.token, "ClickHouse", "ClickHouse", "core") - bp.execute(args.repo, args.upstream, args.til, cherrypick_run) diff --git a/utils/github/cherrypick.py b/utils/github/cherrypick.py deleted file mode 100644 index c6469fa62a9..00000000000 --- a/utils/github/cherrypick.py +++ /dev/null @@ -1,323 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Backports changes from PR to release branch. 
-Requires multiple separate runs as part of the implementation. - -First run should do the following: -1. Merge release branch with a first parent of merge-commit of PR (using 'ours' strategy). (branch: backport/{branch}/{pr}) -2. Create temporary branch over merge-commit to use it for PR creation. (branch: cherrypick/{merge_commit}) -3. Create PR from temporary branch to backport branch (emulating cherry-pick). - -Second run checks PR from previous run to be merged or at least being mergeable. If it's not merged then try to merge it. - -Third run creates PR from backport branch (with merged previous PR) to release branch. -""" - -try: - from clickhouse.utils.github.query import Query as RemoteRepo -except: - from .query import Query as RemoteRepo - -import argparse -from enum import Enum -import logging -import os -import subprocess -import sys - - -class CherryPick: - class Status(Enum): - DISCARDED = "discarded" - NOT_INITIATED = "not started" - FIRST_MERGEABLE = "waiting for 1st stage" - FIRST_CONFLICTS = "conflicts on 1st stage" - SECOND_MERGEABLE = "waiting for 2nd stage" - SECOND_CONFLICTS = "conflicts on 2nd stage" - MERGED = "backported" - - def _run(self, args): - out = subprocess.check_output(args).rstrip() - logging.debug(out) - return out - - def __init__(self, token, owner, name, team, pr_number, target_branch): - self._gh = RemoteRepo(token, owner=owner, name=name, team=team) - self._pr = self._gh.get_pull_request(pr_number) - - self.ssh_url = self._gh.ssh_url - - # TODO: check if pull-request is merged. - - self.merge_commit_oid = self._pr["mergeCommit"]["oid"] - - self.target_branch = target_branch - self.backport_branch = "backport/{branch}/{pr}".format( - branch=target_branch, pr=pr_number - ) - self.cherrypick_branch = "cherrypick/{branch}/{oid}".format( - branch=target_branch, oid=self.merge_commit_oid - ) - - def getCherryPickPullRequest(self): - return self._gh.find_pull_request( - base=self.backport_branch, head=self.cherrypick_branch - ) - - def createCherryPickPullRequest(self, repo_path): - DESCRIPTION = ( - "This pull-request is a first step of an automated backporting.\n" - "It contains changes like after calling a local command `git cherry-pick`.\n" - "If you intend to continue backporting this changes, then resolve all conflicts if any.\n" - "Otherwise, if you do not want to backport them, then just close this pull-request.\n" - "\n" - "The check results does not matter at this step - you can safely ignore them.\n" - "Also this pull-request will be merged automatically as it reaches the mergeable state, but you always can merge it manually.\n" - ) - - # FIXME: replace with something better than os.system() - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@yandex-team.ru", - "-c", - "user.name=robot-clickhouse", - ] - base_commit_oid = self._pr["mergeCommit"]["parents"]["nodes"][0]["oid"] - - # Create separate branch for backporting, and make it look like real cherry-pick. - self._run(git_prefix + ["checkout", "-f", self.target_branch]) - self._run(git_prefix + ["checkout", "-B", self.backport_branch]) - self._run(git_prefix + ["merge", "-s", "ours", "--no-edit", base_commit_oid]) - - # Create secondary branch to allow pull request with cherry-picked commit. 
- self._run( - git_prefix + ["branch", "-f", self.cherrypick_branch, self.merge_commit_oid] - ) - - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.cherrypick_branch), - ] - ) - - # Create pull-request like a local cherry-pick - pr = self._gh.create_pull_request( - source=self.cherrypick_branch, - target=self.backport_branch, - title="Cherry pick #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ), - description="Original pull-request #{}\n\n{}".format( - self._pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. - self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "do not test") - self._gh.set_label(pr, "pr-cherrypick") - - return pr - - def mergeCherryPickPullRequest(self, cherrypick_pr): - return self._gh.merge_pull_request(cherrypick_pr["id"]) - - def getBackportPullRequest(self): - return self._gh.find_pull_request( - base=self.target_branch, head=self.backport_branch - ) - - def createBackportPullRequest(self, cherrypick_pr, repo_path): - DESCRIPTION = ( - "This pull-request is a last step of an automated backporting.\n" - "Treat it as a standard pull-request: look at the checks and resolve conflicts.\n" - "Merge it only if you intend to backport changes to the target branch, otherwise just close it.\n" - ) - - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@clickhouse.com", - "-c", - "user.name=robot-clickhouse", - ] - - pr_title = "Backport #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ) - - self._run(git_prefix + ["checkout", "-f", self.backport_branch]) - self._run(git_prefix + ["pull", "--ff-only", "origin", self.backport_branch]) - self._run( - git_prefix - + [ - "reset", - "--soft", - self._run( - git_prefix - + [ - "merge-base", - "origin/" + self.target_branch, - self.backport_branch, - ] - ), - ] - ) - self._run(git_prefix + ["commit", "-a", "--allow-empty", "-m", pr_title]) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - - pr = self._gh.create_pull_request( - source=self.backport_branch, - target=self.target_branch, - title=pr_title, - description="Original pull-request #{}\nCherry-pick pull-request #{}\n\n{}".format( - self._pr["number"], cherrypick_pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. 
- self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "pr-backport") - - return pr - - def execute(self, repo_path, dry_run=False): - pr1 = self.getCherryPickPullRequest() - if not pr1: - if not dry_run: - pr1 = self.createCherryPickPullRequest(repo_path) - logging.debug( - "Created PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - else: - return CherryPick.Status.NOT_INITIATED - else: - logging.debug( - "Found PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"] and pr1["mergeable"] == "MERGEABLE" and not pr1["closed"]: - if not dry_run: - pr1 = self.mergeCherryPickPullRequest(pr1) - logging.debug( - "Merged PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"]: - logging.debug( - "Waiting for PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if pr1["closed"]: - return CherryPick.Status.DISCARDED - elif pr1["mergeable"] == "CONFLICTING": - return CherryPick.Status.FIRST_CONFLICTS - else: - return CherryPick.Status.FIRST_MERGEABLE - - pr2 = self.getBackportPullRequest() - if not pr2: - if not dry_run: - pr2 = self.createBackportPullRequest(pr1, repo_path) - logging.debug( - "Created PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - else: - return CherryPick.Status.FIRST_MERGEABLE - else: - logging.debug( - "Found PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - - if pr2["merged"]: - return CherryPick.Status.MERGED - elif pr2["closed"]: - return CherryPick.Status.DISCARDED - elif pr2["mergeable"] == "CONFLICTING": - return CherryPick.Status.SECOND_CONFLICTS - else: - return CherryPick.Status.SECOND_MERGEABLE - - -if __name__ == "__main__": - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.DEBUG) - - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", "-t", type=str, required=True, help="token for Github access" - ) - parser.add_argument("--pr", type=str, required=True, help="PR# to cherry-pick") - parser.add_argument( - "--branch", - "-b", - type=str, - required=True, - help="target branch name for cherry-pick", - ) - parser.add_argument( - "--repo", - "-r", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - args = parser.parse_args() - - cp = CherryPick( - args.token, "ClickHouse", "ClickHouse", "core", args.pr, args.branch - ) - cp.execute(args.repo) diff --git a/utils/github/local.py b/utils/github/local.py deleted file mode 100644 index 571c9102ba0..00000000000 --- a/utils/github/local.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- - -import functools -import logging -import os -import re - - -class RepositoryBase: - def __init__(self, repo_path): - import git - - self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) - - # comparator of commits - def cmp(x, y): - if str(x) == str(y): - return 0 - if self._repo.is_ancestor(x, y): - return -1 - else: - return 1 - - self.comparator = functools.cmp_to_key(cmp) - - def get_head_commit(self): - return self._repo.commit(self._default) - - def iterate(self, begin, end): - rev_range = "{}...{}".format(begin, end) - for commit in self._repo.iter_commits(rev_range, first_parent=True): - yield commit - - -class 
Repository(RepositoryBase): - def __init__(self, repo_path, remote_name, default_branch_name): - super(Repository, self).__init__(repo_path) - self._remote = self._repo.remotes[remote_name] - self._remote.fetch() - self._default = self._remote.refs[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * remote branch (git.refs.remote.RemoteReference), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/remotes/.+/\d+\.\d+$") - - for branch in [ - r for r in self._remote.refs if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) - - -class BareRepository(RepositoryBase): - def __init__(self, repo_path, default_branch_name): - super(BareRepository, self).__init__(repo_path) - self._default = self._repo.branches[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * branch (git.refs.head?), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/heads/\d+\.\d+$") - - for branch in [ - r for r in self._repo.branches if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. 
Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) diff --git a/utils/github/parser.py b/utils/github/parser.py deleted file mode 100644 index d8348e6d964..00000000000 --- a/utils/github/parser.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - - -class Description: - """Parsed description representation""" - - MAP_CATEGORY_TO_LABEL = { - "New Feature": "pr-feature", - "Bug Fix": "pr-bugfix", - "Improvement": "pr-improvement", - "Performance Improvement": "pr-performance", - # 'Backward Incompatible Change': doesn't match anything - "Build/Testing/Packaging Improvement": "pr-build", - "Non-significant (changelog entry is not needed)": "pr-non-significant", - "Non-significant (changelog entry is not required)": "pr-non-significant", - "Non-significant": "pr-non-significant", - "Documentation (changelog entry is not required)": "pr-documentation", - # 'Other': doesn't match anything - } - - def __init__(self, pull_request): - self.label_name = str() - self.legal = False - - self._parse(pull_request["bodyText"]) - - def _parse(self, text): - lines = text.splitlines() - next_category = False - category = str() - - for line in lines: - stripped = line.strip() - - if not stripped: - continue - - if next_category: - category = stripped - next_category = False - - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - - category_headers = ( - "Category (leave one):", - "Changelog category (leave one):", - "Changelog category:", - "Category:", - ) - - if stripped in category_headers: - next_category = True - - if category in Description.MAP_CATEGORY_TO_LABEL: - self.label_name = Description.MAP_CATEGORY_TO_LABEL[category] - else: - if not category: - print("Cannot find category in pr description") - else: - print(("Unknown category: " + category)) diff --git a/utils/github/query.py b/utils/github/query.py deleted file mode 100644 index 7afbc57781c..00000000000 --- a/utils/github/query.py +++ /dev/null @@ -1,492 +0,0 @@ -# -*- coding: utf-8 -*- - -import requests - - -class Query: - """ - Implements queries to the Github API using GraphQL - """ - - _PULL_REQUEST = """ - author {{ - ... on User {{ - id - login - }} - }} - - baseRepository {{ - nameWithOwner - }} - - mergeCommit {{ - oid - parents(first: {min_page_size}) {{ - totalCount - nodes {{ - oid - }} - }} - }} - - mergedBy {{ - ... 
on User {{ - id - login - }} - }} - - baseRefName - closed - headRefName - id - mergeable - merged - number - title - url - """ - - def __init__(self, token, owner, name, team, max_page_size=100, min_page_size=10): - self._PULL_REQUEST = Query._PULL_REQUEST.format(min_page_size=min_page_size) - - self._token = token - self._owner = owner - self._name = name - self._team = team - - self._max_page_size = max_page_size - self._min_page_size = min_page_size - - self.api_costs = {} - - repo = self.get_repository() - self._id = repo["id"] - self.ssh_url = repo["sshUrl"] - self.default_branch = repo["defaultBranchRef"]["name"] - - self.members = set(self.get_members()) - - def get_repository(self): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - name - }} - id - sshUrl - }} - """ - - query = _QUERY.format(owner=self._owner, name=self._name) - return self._run(query)["repository"] - - def get_members(self): - """Get all team members for organization - - Returns: - members: a map of members' logins to ids - """ - - _QUERY = """ - organization(login: "{organization}") {{ - team(slug: "{team}") {{ - members(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - login - }} - }} - }} - }} - """ - - members = {} - not_end = True - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["organization"]["team"] - if result is None: - break - result = result["members"] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - members += dict([(node["login"], node["id"]) for node in result["nodes"]]) - - return members - - def get_pull_request(self, number): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequest(number: {number}) {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - number=number, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequest"] - - def find_pull_request(self, base, head): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} baseRefName: "{base}" headRefName: "{head}") {{ - nodes {{ - {pull_request_data} - }} - totalCount - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - base=base, - head=head, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - result = self._run(query)["repository"]["pullRequests"] - if result["totalCount"] > 0: - return result["nodes"][0] - else: - return {} - - def find_pull_requests(self, label_name): - """ - Get all pull-requests filtered by label name - """ - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ - nodes {{ - {pull_request_data} - }} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - label_name=label_name, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequests"]["nodes"] - - def get_pull_requests(self, before_commit): - """ - Get all merged pull-requests from the HEAD of default branch to the last commit (excluding) - """ - 
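A note on the helper being removed above: the Query class walks GitHub's GraphQL API with cursor-based pagination. Each query asks for pageInfo { hasNextPage endCursor } and is re-issued with an after: "<endCursor>" argument until hasNextPage is false; get_members above and get_pull_requests / set_label below all follow this shape. Below is a minimal, self-contained sketch of that pagination loop, assuming the standard GitHub GraphQL endpoint; the run_graphql and iter_team_members names and the exact query text are illustrative, not part of the original file:

    import requests

    GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

    def run_graphql(token, query):
        # One GraphQL round-trip; fails loudly on HTTP or GraphQL-level errors.
        response = requests.post(
            GITHUB_GRAPHQL_URL,
            json={"query": query},
            headers={"Authorization": "bearer {}".format(token)},
        )
        response.raise_for_status()
        payload = response.json()
        if "errors" in payload:
            raise Exception("GraphQL errors: {}".format(payload["errors"]))
        return payload["data"]

    def iter_team_members(token, organization, team, page_size=100):
        # Cursor pagination: keep requesting pages while hasNextPage is true,
        # passing the previous page's endCursor back through the `after:` argument.
        cursor_arg = ""
        while True:
            query = """
            query {{
              organization(login: "{org}") {{
                team(slug: "{team}") {{
                  members(first: {size} {after}) {{
                    pageInfo {{ hasNextPage endCursor }}
                    nodes {{ id login }}
                  }}
                }}
              }}
            }}
            """.format(org=organization, team=team, size=page_size, after=cursor_arg)
            members = run_graphql(token, query)["organization"]["team"]["members"]
            for node in members["nodes"]:
                yield node["login"], node["id"]
            if not members["pageInfo"]["hasNextPage"]:
                break
            cursor_arg = 'after: "{}"'.format(members["pageInfo"]["endCursor"])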
- _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - target {{ - ... on Commit {{ - history(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - oid - associatedPullRequests(first: {min_page_size}) {{ - totalCount - nodes {{ - ... on PullRequest {{ - {pull_request_data} - - labels(first: {min_page_size}) {{ - totalCount - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - name - color - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - """ - - pull_requests = [] - not_end = True - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["defaultBranchRef"]["target"][ - "history" - ] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - for commit in result["nodes"]: - # FIXME: maybe include `before_commit`? - if str(commit["oid"]) == str(before_commit): - not_end = False - break - - # TODO: fetch all pull-requests that were merged in a single commit. - assert ( - commit["associatedPullRequests"]["totalCount"] - <= self._min_page_size - ) - - for pull_request in commit["associatedPullRequests"]["nodes"]: - if ( - pull_request["baseRepository"]["nameWithOwner"] - == "{}/{}".format(self._owner, self._name) - and pull_request["baseRefName"] == self.default_branch - and pull_request["mergeCommit"]["oid"] == commit["oid"] - ): - pull_requests.append(pull_request) - - return pull_requests - - def create_pull_request( - self, source, target, title, description="", draft=False, can_modify=True - ): - _QUERY = """ - createPullRequest(input: {{ - baseRefName: "{target}", - headRefName: "{source}", - repositoryId: "{id}", - title: "{title}", - body: "{body}", - draft: {draft}, - maintainerCanModify: {modify} - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - target=target, - source=source, - id=self._id, - title=title, - body=description, - draft="true" if draft else "false", - modify="true" if can_modify else "false", - pull_request_data=self._PULL_REQUEST, - ) - return self._run(query, is_mutation=True)["createPullRequest"]["pullRequest"] - - def merge_pull_request(self, id): - _QUERY = """ - mergePullRequest(input: {{ - pullRequestId: "{id}" - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format(id=id, pull_request_data=self._PULL_REQUEST) - return self._run(query, is_mutation=True)["mergePullRequest"]["pullRequest"] - - # FIXME: figure out how to add more assignees at once - def add_assignee(self, pr, assignee): - _QUERY = """ - addAssigneesToAssignable(input: {{ - assignableId: "{id1}", - assigneeIds: "{id2}" - }}) {{ - clientMutationId - }} - """ - - query = _QUERY.format(id1=pr["id"], id2=assignee["id"]) - self._run(query, is_mutation=True) - - def set_label(self, pull_request, label_name): - """ - Set label by name to the pull request - - Args: - pull_request: JSON object returned by `get_pull_requests()` - label_name (string): label name - """ - - _GET_LABEL = """ - repository(owner: "{owner}" name: "{name}") {{ - labels(first: {max_page_size} {next} query: "{label_name}") {{ - pageInfo {{ - hasNextPage - 
endCursor - }} - nodes {{ - id - name - color - }} - }} - }} - """ - - _SET_LABEL = """ - addLabelsToLabelable(input: {{ - labelableId: "{pr_id}", - labelIds: "{label_id}" - }}) {{ - clientMutationId - }} - """ - - labels = [] - not_end = True - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["labels"] - not_end = result["pageInfo"]["hasNextPage"] - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - labels += [label for label in result["nodes"]] - - if not labels: - return - - query = _SET_LABEL.format(pr_id=pull_request["id"], label_id=labels[0]["id"]) - self._run(query, is_mutation=True) - - def _run(self, query, is_mutation=False): - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - - def requests_retry_session( - retries=3, - backoff_factor=0.3, - status_forcelist=(500, 502, 504), - session=None, - ): - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - - headers = {"Authorization": "bearer {}".format(self._token)} - if is_mutation: - query = """ - mutation {{ - {query} - }} - """.format( - query=query - ) - else: - query = """ - query {{ - {query} - rateLimit {{ - cost - remaining - }} - }} - """.format( - query=query - ) - - while True: - request = requests_retry_session().post( - "https://api.github.com/graphql", json={"query": query}, headers=headers - ) - if request.status_code == 200: - result = request.json() - if "errors" in result: - raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) - ) - - if not is_mutation: - import inspect - - caller = inspect.getouterframes(inspect.currentframe(), 2)[1][3] - if caller not in list(self.api_costs.keys()): - self.api_costs[caller] = 0 - self.api_costs[caller] += result["data"]["rateLimit"]["cost"] - - return result["data"] - else: - import json - - raise Exception( - "Query failed with code {code}:\n{json}".format( - code=request.status_code, - json=json.dumps(request.json(), indent=4), - ) - ) From bf591b971a6d9dbd1b63357ac353066d6348fc34 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:53:08 +0200 Subject: [PATCH 33/58] Moar categories --- tests/ci/run_check.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 74e60582e8d..608a708b04b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -89,6 +89,10 @@ MAP_CATEGORY_TO_LABEL = { "Performance Improvement": "pr-performance", "Backward Incompatible Change": "pr-backward-incompatible", "Build/Testing/Packaging Improvement": "pr-build", + "Build Improvement": "pr-build", + "Build/Testing Improvement": "pr-build", + "Build": "pr-build", + "Packaging Improvement": "pr-build", "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", "Not for changelog": "pr-not-for-changelog", "Documentation (changelog entry is not required)": "pr-documentation", From 693b47cdc9c39ee4d51202592cabc915ed9b0962 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 
2022 17:51:56 +0300 Subject: [PATCH 34/58] Revert "Fix enable LLVM for JIT compilation in CMake" --- contrib/llvm-cmake/CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/contrib/llvm-cmake/CMakeLists.txt b/contrib/llvm-cmake/CMakeLists.txt index 87c8a65510f..6ff07f0e016 100644 --- a/contrib/llvm-cmake/CMakeLists.txt +++ b/contrib/llvm-cmake/CMakeLists.txt @@ -1,9 +1,12 @@ -if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined") - set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) +# During cross-compilation in our CI we have to use llvm-tblgen and other building tools +# tools to be build for host architecture and everything else for target architecture (e.g. AArch64) +# Possible workaround is to use llvm-tblgen from some package... +# But lets just enable LLVM for native builds +if (CMAKE_CROSSCOMPILING OR SANITIZE STREQUAL "undefined") + set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) else() - set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) + set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) endif() - option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ${ENABLE_EMBEDDED_COMPILER_DEFAULT}) if (NOT ENABLE_EMBEDDED_COMPILER) From 27390799c186c0f2b69d882a983d4f36bf9945a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 18:23:18 +0200 Subject: [PATCH 35/58] Fix black --- tests/ci/docker_images_check.py | 1 + tests/integration/test_allowed_url_from_config/test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index ef6d369bf4f..4277ef1e15f 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -465,5 +465,6 @@ def main(): if status == "error": sys.exit(1) + if __name__ == "__main__": main() diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From d59941e4f6ccd623b1f938460e6a712dd69a73d0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:15:25 +0200 Subject: [PATCH 36/58] Style --- tests/ci/commit_status_helper.py | 1 + tests/integration/test_allowed_url_from_config/test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 53dfb145b44..4c79d5ace07 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -60,6 +60,7 @@ def post_commit_status_to_file(file_path, description, state, report_url): out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + def post_label(gh, pr_info, label_name): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def 
test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From 9220bedb7dc57a1f0ee2be1999610fad75ea674a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:18 +0200 Subject: [PATCH 37/58] Add submodule changed --- tests/ci/commit_status_helper.py | 5 +++-- tests/ci/pr_info.py | 9 +++++++++ tests/ci/run_check.py | 12 +++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 4c79d5ace07..b4a3e49372f 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,7 +61,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) -def post_label(gh, pr_info, label_name): +def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) - pull_request.add_to_labels(label_name) + for label in labels_names: + pull_request.add_to_labels(label) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index ee4399792ae..0de4ec89124 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -236,6 +236,15 @@ class PRInfo: return True return False + def has_changes_in_submodules(self): + if self.changed_files is None or not self.changed_files: + return True + + for f in self.changed_files: + if "contrib" in f: + return True + return False + def can_skip_builds_and_use_version_from_master(self): # TODO: See a broken loop if "force tests" in self.labels: diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 608a708b04b..f0100082c71 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_label +from commit_status_helper import get_commit, post_labels NAME = "Run Check (actions)" @@ -22,6 +22,7 @@ OK_SKIP_LABELS = {"release", "pr-backport", "pr-cherrypick"} CAN_BE_TESTED_LABEL = "can be tested" DO_NOT_TEST_LABEL = "do not test" FORCE_TESTS_LABEL = "force tests" +SUBMODULE_CHANGED_LABEL = "submodule changed" # Individual trusted contirbutors who are not in any trusted organization. 
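As a side note on the hunk above: has_changes_in_submodules is a pure path heuristic. An unknown or empty change set is treated conservatively as "submodules may have changed", and otherwise any changed path that contains "contrib" (the directory where third-party submodules such as contrib/NuRaft are checked out) flags the PR for the "submodule changed" label. A standalone sketch of the same check; the sample change sets are made up for illustration:

    def has_changes_in_submodules(changed_files):
        # Unknown or empty change set: assume submodules may have been touched.
        if not changed_files:
            return True
        # Third-party submodules live under contrib/, so any path containing
        # "contrib" counts as a submodule change.
        return any("contrib" in path for path in changed_files)

    # Only the second change set would get the "submodule changed" label.
    print(has_changes_in_submodules(["tests/ci/run_check.py"]))             # False
    print(has_changes_in_submodules(["contrib/NuRaft", "docs/README.md"]))  # True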
# Can be changed in runtime: we will append users that we learned to be in @@ -227,15 +228,20 @@ def check_pr_description(pr_info): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(need_orgs=True, pr_event_from_api=True) + pr_info = PRInfo(need_orgs=True, pr_event_from_api=True, need_changed_files=True) can_run, description, labels_state = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) + pr_labels = [] if category in MAP_CATEGORY_TO_LABEL: - post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + if pr_info.has_changes_in_submodules(): + pr_labels.append(SUBMODULE_CHANGED_LABEL) + + post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 999558f00fa01605c42eacd2f85243cf74291e4c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:45 +0200 Subject: [PATCH 38/58] Try fake update submodule --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 1707a7572aa..7c6c6961347 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d +Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 From 1a8b2c9637c53bef291c0ab33866f84ef437f2e4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:06 +0200 Subject: [PATCH 39/58] Remove submodule changed --- tests/ci/commit_status_helper.py | 7 +++++++ tests/ci/run_check.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index b4a3e49372f..a53ce6715d5 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,6 +61,13 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) +def remove_labels(gh, pr_info, labels_names): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + for label in labels_names: + pull_request.remove_from_labels(label) + + def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index f0100082c71..0138808c6ae 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_labels +from commit_status_helper import get_commit, post_labels, remove_labels NAME = "Run Check (actions)" @@ -235,13 +235,19 @@ if __name__ == "__main__": description_report, category = check_pr_description(pr_info) pr_labels = [] - if category in MAP_CATEGORY_TO_LABEL: + if ( + category in MAP_CATEGORY_TO_LABEL + and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels + ): pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) if pr_info.has_changes_in_submodules(): pr_labels.append(SUBMODULE_CHANGED_LABEL) + elif SUBMODULE_CHANGED_LABEL in pr_info.labels: + remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) - post_labels(gh, pr_info, pr_labels) + if pr_labels: + post_labels(gh, pr_info, pr_labels) if 
description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 2e4ab305dd2a6951ad045b6cbdb920730710ddff Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:13 +0200 Subject: [PATCH 40/58] Revert "Try fake update submodule" This reverts commit 999558f00fa01605c42eacd2f85243cf74291e4c. --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 7c6c6961347..1707a7572aa 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 +Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d From 74cafe154a4e707d7f21ed7c39aa8c0e04e08927 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 19:58:59 +0200 Subject: [PATCH 41/58] fix flaky test 02246_async_insert_quota --- tests/queries/0_stateless/02246_async_insert_quota.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_async_insert_quota.sh b/tests/queries/0_stateless/02246_async_insert_quota.sh index 9fc4df4660c..4da93f94f19 100755 --- a/tests/queries/0_stateless/02246_async_insert_quota.sh +++ b/tests/queries/0_stateless/02246_async_insert_quota.sh @@ -16,7 +16,7 @@ ${CLICKHOUSE_CLIENT} -q "CREATE ROLE r02246" ${CLICKHOUSE_CLIENT} -q "CREATE USER u02246" ${CLICKHOUSE_CLIENT} -q "GRANT INSERT ON async_inserts_02246 TO r02246" ${CLICKHOUSE_CLIENT} -q "GRANT r02246 to u02246" -${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 1 HOUR MAX QUERY INSERTS = 2 TO r02246" +${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 100 YEAR MAX QUERY INSERTS = 2 TO r02246" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (1, 'a')" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (2, 'b')" From 476c7f9d7e10e882a085a2b2a08cec43ed951718 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:32:28 +0000 Subject: [PATCH 42/58] Update ci checks server. --- tests/ci/clickhouse_helper.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 7ccbcb4a47e..3731037bf4c 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -10,13 +10,13 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url2") - self.auth = { - "X-ClickHouse-User": get_parameter_from_ssm( - "clickhouse-test-stat-login2" - ), - "X-ClickHouse-Key": "", - } + self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + + self.url = url + self.auth = { + 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + } @staticmethod def _insert_json_str_info_impl(url, auth, db, table, json_str): From 3995a52da61cb727f9604b5d0114adc500d4bced Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:36:36 +0000 Subject: [PATCH 43/58] Fix check. 
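For context on this and the next few fixes: patch 42 above left ClickHouseHelper.__init__ referring to user and password names that are never defined, and its unconditional self.url = url overwrote the value just fetched from SSM when no URL was passed in. Patches 43 and 44 below repair that, and the later style patches only reformat the same lines. Assembled from those follow-up hunks (a sketch for readability, not an additional patch), the constructor ends up roughly as:

    from get_robot_token import get_parameter_from_ssm


    class ClickHouseHelper:
        def __init__(self, url=None):
            # Fall back to the URL stored in SSM when none is passed explicitly.
            if url is None:
                url = get_parameter_from_ssm("clickhouse-test-stat-url")

            self.url = url
            # Credentials always come from SSM parameters.
            self.auth = {
                "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"),
                "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"),
            }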
--- tests/ci/clickhouse_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 3731037bf4c..ea99b97c7d8 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,7 +8,7 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None): + def __init__(self, url=None, user=None, password=None): if url is None: self.url = get_parameter_from_ssm("clickhouse-test-stat-url") From bb82f77477038fa9623f40c21d81ed9899eff429 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:46:48 +0000 Subject: [PATCH 44/58] Fix check. --- tests/ci/clickhouse_helper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index ea99b97c7d8..093d5541362 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,14 +8,14 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None, user=None, password=None): + def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + url = get_parameter_from_ssm("clickhouse-test-stat-url") self.url = url self.auth = { - 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 081073763748106499d3279894f99705ab332700 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:58:16 +0000 Subject: [PATCH 45/58] Fix style --- tests/ci/clickhouse_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 093d5541362..6627a3974d1 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -14,8 +14,8 @@ class ClickHouseHelper: self.url = url self.auth = { - 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 3849e63ab186d86a29ed0c77e1524800ff87bafd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 19:06:50 +0000 Subject: [PATCH 46/58] Change database --- docker/test/performance-comparison/compare.sh | 2 +- tests/ci/clickhouse_helper.py | 2 +- tests/ci/compatibility_check.py | 2 +- tests/ci/docker_images_check.py | 2 +- tests/ci/docker_manifests_merge.py | 2 +- tests/ci/docs_check.py | 2 +- tests/ci/fast_test_check.py | 2 +- tests/ci/functional_test_check.py | 2 +- tests/ci/integration_test_check.py | 2 +- tests/ci/keeper_jepsen_check.py | 2 +- tests/ci/split_build_smoke_check.py | 2 +- tests/ci/stress_check.py | 2 +- tests/ci/style_check.py | 2 +- tests/ci/unit_tests_check.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 54f71ce05bb..cdfa080a475 100755 --- a/docker/test/performance-comparison/compare.sh +++ 
b/docker/test/performance-comparison/compare.sh @@ -1378,7 +1378,7 @@ $REF_SHA $SHA_TO_TEST $(numactl --hardware | sed -n 's/^available:[[:space:]]\+/ EOF # Also insert some data about the check into the CI checks table. - "${client[@]}" --query "INSERT INTO "'"'"gh-data"'"'".checks FORMAT TSVWithNamesAndTypes" \ + "${client[@]}" --query "INSERT INTO "'"'"default"'"'".checks FORMAT TSVWithNamesAndTypes" \ < ci-checks.tsv set -x diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 6627a3974d1..218aaca8b91 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -179,7 +179,7 @@ def mark_flaky_tests(clickhouse_helper, check_name, test_results): check_name=check_name ) - tests_data = clickhouse_helper.select_json_each_row("gh-data", query) + tests_data = clickhouse_helper.select_json_each_row("default", query) master_failed_tests = {row["test_name"] for row in tests_data} logging.info("Found flaky tests: %s", ", ".join(master_failed_tests)) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index d546fabf231..a6846257cce 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -197,4 +197,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 3d0cc468aec..44c4623b469 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -459,7 +459,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docker_manifests_merge.py b/tests/ci/docker_manifests_merge.py index 8bd50819877..9371440346e 100644 --- a/tests/ci/docker_manifests_merge.py +++ b/tests/ci/docker_manifests_merge.py @@ -234,7 +234,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 58678b160a4..a7de66859e2 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -114,4 +114,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 64e04594786..8c17c15b463 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -204,7 +204,7 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) # Refuse other checks to run if fast test failed if state != "success": diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 52ec5a0f8e9..c5a44ba66d2 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -356,7 +356,7 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - 
ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if state != "success": if "force-tests" in pr_info.labels: diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 30009414d6e..636fbe664ce 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -279,4 +279,4 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 24d720e67ab..14c31927b75 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -271,5 +271,5 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) clear_autoscaling_group() diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 41ba6c2fedb..f281bdc55a8 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -147,4 +147,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 32c181140e2..4ecdd249351 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -176,4 +176,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 1b3037217c8..4c210f9beab 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -117,4 +117,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 84c4faa822d..6db160d9a54 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -173,4 +173,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) From 7d140c9a59c1760d7f89d6b1cbc3ef149f3324e5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 29 Mar 2022 22:30:50 +0300 Subject: [PATCH 47/58] clickhouse-test: increase timeout for obtaining processlist Sometimes 30 seconds is not enough on CI [1]: 2022.03.29 04:46:09.805406 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} executeQuery: (from [::1]:33506) SELECT * FROM system.processes WHERE query NOT LIKE '%system.processes%' AND Settings['log_comment'] = '02016_order_by_with_fill_monotonic_functions_removal.sql' AND current_database = 'test_7fqp6w' 2022.03.29 04:46:09.866100 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} ContextAccess (default): Access granted: SELECT(is_initial_query, user, query_id, 
address, port, initial_user, initial_query_id, initial_address, initial_port, interface, os_user, client_hostname, client_name, client_revision, client_version_major, client_version_minor, client_version_patch, http_method, http_user_agent, http_referer, forwarded_for, quota_key, distributed_depth, elapsed, is_cancelled, read_rows, read_bytes, total_rows_approx, written_rows, written_bytes, memory_usage, peak_memory_usage, query, thread_ids, ProfileEvents, Settings, current_database) ON system.processes 2022.03.29 04:46:12.787395 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} InterpreterSelectQuery: FetchColumns -> Complete 2022.03.29 04:46:19.749163 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} ParallelFormattingOutputFormat: Parallel formatting is being used 2022.03.29 04:46:37.923282 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} executeQuery: Read 15 rows, 44.20 KiB in 28.117172383 sec., 0 rows/sec., 1.57 KiB/sec. 2022.03.29 04:46:40.020586 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} DynamicQueryHandler: Done processing query 2022.03.29 04:46:40.033535 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} MemoryTracker: Peak memory usage (for query): 4.00 MiB. [1]: https://s3.amazonaws.com/clickhouse-test-reports/32928/ddd5bebe555ce8feebcdd339e47fc45184c20dd1/stateless_tests__thread__actions__[1/3].html Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 863e061085a..5f9ab01cdcc 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -116,7 +116,7 @@ def clickhouse_execute_http(base_args, query, timeout=30, settings=None, default def clickhouse_execute(base_args, query, timeout=30, settings=None): return clickhouse_execute_http(base_args, query, timeout, settings).strip() -def clickhouse_execute_json(base_args, query, timeout=30, settings=None): +def clickhouse_execute_json(base_args, query, timeout=60, settings=None): data = clickhouse_execute_http(base_args, query, timeout, settings, 'JSONEachRow') if not data: return None From b264bdcf5e65874e41457ff94d8d6fe853560c2a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 20 Mar 2022 21:40:58 +0300 Subject: [PATCH 48/58] tests: wait for left queries in 00417_kill_query CI: https://s3.amazonaws.com/clickhouse-test-reports/0/c244ee7cbb61fea384679e18a577ff579060288b/stateless_tests__release__s3_storage__actions_/runlog.log Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/00417_kill_query.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/00417_kill_query.sh b/tests/queries/0_stateless/00417_kill_query.sh index 28f03fc9c37..e40eb5c3f80 100755 --- a/tests/queries/0_stateless/00417_kill_query.sh +++ b/tests/queries/0_stateless/00417_kill_query.sh @@ -21,3 +21,5 @@ $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 ASYNC" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 FORMAT TabSeparated" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 SYNC FORMAT TabSeparated" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 1 TEST" &>/dev/null + +clickhouse_test_wait_queries 60 From 2a099d90fe5dcdd788123ab9e2e604d2dcb9d7ba Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 30 Mar 2022 10:45:13 +0300 Subject: [PATCH 49/58] tests: make 02152_http_external_tables_memory_tracking more stable After settings randomization, it can not reach max_untracked_memory, i.e. due to max_block_size=67990 for example (like in [1]). 
[1]: https://s3.amazonaws.com/clickhouse-test-reports/35683/c609426d091ea8b9f2d37f8d7fcbf9e759cc94e5/stateless_tests__release__actions_.html Fix this, by settings max_untracked_memory to 0, to track any allocations. Signed-off-by: Azat Khuzhin --- .../0_stateless/02152_http_external_tables_memory_tracking.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh index 44de0e15370..4132ac91ae4 100755 --- a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh @@ -23,7 +23,7 @@ function run_and_check() echo "Checking $*" # Run query with external table (implicit StorageMemory user) - $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&query_id=$query_id&$*" -o /dev/null + $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&max_untracked_memory=0&query_id=$query_id&$*" -o /dev/null ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SYSTEM FLUSH LOGS' From 68ec0d92c0c5446d7a32156ae2940e78bedb582f Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 10:58:34 +0200 Subject: [PATCH 50/58] Remove if category changed --- tests/ci/run_check.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 0138808c6ae..5f3792f3432 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -234,20 +234,29 @@ if __name__ == "__main__": commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) - pr_labels = [] + pr_labels_to_add = [] + pr_labels_to_remove = [] if ( category in MAP_CATEGORY_TO_LABEL and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels ): - pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) + + for label in pr_info.labels: + if label in MAP_CATEGORY_TO_LABEL and label not in pr_labels_to_add: + pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): - pr_labels.append(SUBMODULE_CHANGED_LABEL) + pr_labels_to_add.append(SUBMODULE_CHANGED_LABEL) elif SUBMODULE_CHANGED_LABEL in pr_info.labels: - remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) + pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL) + + if pr_labels_to_add: + post_labels(gh, pr_info, pr_labels_to_add) + + if pr_labels_to_remove: + remove_labels(gh, pr_info, pr_labels_to_remove) - if pr_labels: - post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From bcf64a73d1c272dca22a1ca910df3153a6a66784 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:00:45 +0200 Subject: [PATCH 51/58] Followup --- tests/ci/run_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 5f3792f3432..8f28287e88b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,7 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL and label not in pr_labels_to_add: + if label in MAP_CATEGORY_TO_LABEL.values() and label 
not in pr_labels_to_add: pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): From 2a1b331c0de65c680285e777e9a90ad49f1d86f2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:15:54 +0200 Subject: [PATCH 52/58] Fix build check --- tests/ci/build_report_check.py | 6 +++++- tests/ci/docs_check.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 50315ad3d78..4e0859ef865 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -233,7 +233,11 @@ if __name__ == "__main__": if ok_builds == 0 or some_builds_are_missing: summary_status = "error" - description = f"{ok_builds}/{total_builds} builds are OK" + addition = "" + if some_builds_are_missing: + addition = "(some builds are missing)" + + description = f"{ok_builds}/{total_builds} builds are OK {addition}" print("::notice ::Report url: {}".format(url)) diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 6dde0dfaea4..803fb3cff7d 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -114,7 +114,7 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if status == "error": sys.exit(1) From c510ede8dd5b9e739be4de6b6c57aa8bbddfa22b Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:19:11 +0200 Subject: [PATCH 53/58] Fixup --- tests/ci/run_check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 8f28287e88b..93dc77124c2 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,11 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL.values() and label not in pr_labels_to_add: + if ( + label in MAP_CATEGORY_TO_LABEL.values() + and category in MAP_CATEGORY_TO_LABEL + and label != MAP_CATEGORY_TO_LABEL[category] + ): pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): From b19ecf60ad9bd518de109dcd536b45ac716faaaf Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:30:21 +0200 Subject: [PATCH 54/58] Fix style --- tests/ci/clickhouse_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod From d3697625a93a32f34b6955e1763e081909f71584 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 30 Mar 2022 10:55:49 +0000 Subject: [PATCH 55/58] Fix stylecheck --- tests/ci/clickhouse_helper.py | 2 +- utils/tests-visualizer/index.html | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": 
get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html index 13f8daaa151..00076f683fa 100644 --- a/utils/tests-visualizer/index.html +++ b/utils/tests-visualizer/index.html @@ -84,14 +84,14 @@ let render_data_query = ` SELECT groupArray([d, n, fail]) FROM ( SELECT n, check_start_time::Date - start_date AS d, max(test_status LIKE 'F%' OR test_status LIKE 'E%') AS fail - FROM "gh-data".checks + FROM "default".checks INNER JOIN ( SELECT test_name, toUInt16(rowNumberInAllBlocks()) AS n FROM ( SELECT DISTINCT test_name - FROM "gh-data".checks + FROM "default".checks WHERE match(test_name, '^\\d+_') AND check_name ILIKE '%stateless%' AND check_start_time > now() - INTERVAL 1 DAY ORDER BY test_name ) @@ -112,7 +112,7 @@ let test_names_query = ` SELECT test_name, toUInt16(rowNumberInAllBlocks()) AS n FROM ( SELECT DISTINCT test_name - FROM "gh-data".checks + FROM "default".checks WHERE match(test_name, '^\\d+_') AND check_name ILIKE '%stateless%' AND check_start_time > now() - INTERVAL 1 DAY ORDER BY test_name ) FORMAT JSONCompact`; From 8980994faacbc806a9ed1bd3a76c7321132fd270 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 16:29:13 +0200 Subject: [PATCH 56/58] Resurrect build hash --- tests/ci/build_check.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 24af9e5194c..eb2aa616383 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -54,6 +54,7 @@ def get_packager_cmd( build_version: str, image_version: str, ccache_path: str, + official: bool, ) -> str: package_type = build_config["package_type"] comp = build_config["compiler"] @@ -83,6 +84,9 @@ def get_packager_cmd( if _can_export_binaries(build_config): cmd += " --with-binaries=tests" + if official: + cmd += " --official" + return cmd @@ -254,9 +258,11 @@ def main(): logging.info("Got version from repo %s", version.string) + official_flag = pr_info.number == 0 version_type = "testing" if "release" in pr_info.labels or "release-lts" in pr_info.labels: version_type = "stable" + official_flag = True update_version_local(REPO_COPY, version, version_type) @@ -290,7 +296,9 @@ def main(): version.string, image_version, ccache_path, + official=official_flag ) + logging.info("Going to run packager with %s", packager_cmd) build_clickhouse_log = os.path.join(TEMP_PATH, "build_log") From c34d5125864a1e33ab846bcf41e986357dc01a3a Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 30 Mar 2022 18:36:37 +0300 Subject: [PATCH 57/58] Update 00484_preferred_max_column_in_block_size_bytes.sql --- .../00484_preferred_max_column_in_block_size_bytes.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql b/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql index dc021ad52db..470bca70e06 100644 --- a/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql +++ b/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql @@ -1,3 +1,5 @@ +-- Tags: no-random-settings + drop table if exists tab_00484; create table tab_00484 (date Date, x UInt64, s FixedString(128)) engine = MergeTree PARTITION BY date ORDER BY (date, x) SETTINGS min_bytes_for_wide_part = 0; insert into tab_00484 select 
today(), number, toFixedString('', 128) from system.numbers limit 8192; From fcd4e0be654159e34f61d7c4f95f123ba9b6c67c Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 19:15:27 +0200 Subject: [PATCH 58/58] Black --- tests/ci/build_check.py | 2 +- tests/ci/clickhouse_helper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index eb2aa616383..2a079a60367 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -296,7 +296,7 @@ def main(): version.string, image_version, ccache_path, - official=official_flag + official=official_flag, ) logging.info("Going to run packager with %s", packager_cmd) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod