From aa092eeffb8f09d9d65294bfa2a62cacc258e562 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 27 Dec 2021 16:42:06 +0300 Subject: [PATCH 01/41] proper handle of 'max_rows_to_read' in case of reading in order of sorting key and limit --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 14 ++++++++++-- .../MergeTree/MergeTreeSelectProcessor.cpp | 5 ----- ...5_read_in_order_max_rows_to_read.reference | 6 +++++ .../02155_read_in_order_max_rows_to_read.sql | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index cdedd37e14a..07ac6f5764b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -875,12 +875,22 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd { std::atomic total_rows{0}; + /// Do not check number of read rows if we have reading + /// in order of sorting key with limit. + /// In general case, when there exists WHERE clause + /// it's impossible to estimate number of rows precisely, + /// because we can stop reading at any time. + SizeLimits limits; - if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read) + if (settings.read_overflow_mode == OverflowMode::THROW + && settings.max_rows_to_read + && !query_info.input_order_info) limits = SizeLimits(settings.max_rows_to_read, 0, settings.read_overflow_mode); SizeLimits leaf_limits; - if (settings.read_overflow_mode_leaf == OverflowMode::THROW && settings.max_rows_to_read_leaf) + if (settings.read_overflow_mode_leaf == OverflowMode::THROW + && settings.max_rows_to_read_leaf + && !query_info.input_order_info) leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, 0, settings.read_overflow_mode_leaf); auto mark_cache = context->getIndexMarkCache(); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 2d4d3617cee..332eb27094a 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,11 +37,6 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { - /// Actually it means that parallel reading from replicas enabled - /// and we have to collaborate with initiator. 
- /// In this case we won't set approximate rows, because it will be accounted multiple times - if (!extension_.has_value()) - addTotalRowsApprox(total_rows); ordered_names = header_without_virtual_columns.getNames(); } diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference new file mode 100644 index 00000000000..b73ab43cabb --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference @@ -0,0 +1,6 @@ +10 +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql new file mode 100644 index 00000000000..e82c78b5e42 --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_max_rows_to_read; + +CREATE TABLE t_max_rows_to_read (a UInt64) +ENGINE = MergeTree ORDER BY a +SETTINGS index_granularity = 4; + +INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); + +SET max_threads = 1; + +SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 12; + +-- This should work, but actually it doesn't. Need to investigate. +-- SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 20; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 20 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a = 10 OR a = 20 FORMAT Null SETTINGS max_rows_to_read = 4; -- { serverError 158 } + +DROP TABLE t_max_rows_to_read; From 04e6e6ab5e4818f8774ecf3878ca2205d8e1aae6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 14:52:26 +0000 Subject: [PATCH 02/41] Add ParallelReadBuffer for S3 --- src/Common/RangeGenerator.h | 46 ++++++++++++++++++++ src/IO/ReadBufferFromS3.h | 73 ++++++++++++++++++++++++++++++++ src/IO/ReadWriteBufferFromHTTP.h | 38 +---------------- src/IO/S3Common.cpp | 23 ++++++++++ src/IO/S3Common.h | 2 + src/Storages/StorageS3.cpp | 8 +++- 6 files changed, 152 insertions(+), 38 deletions(-) create mode 100644 src/Common/RangeGenerator.h diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h new file mode 100644 index 00000000000..8d3b88936fc --- /dev/null +++ b/src/Common/RangeGenerator.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class RangeGenerator +{ +public: + explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) + : from(range_start), range_step(range_step_), total_size(total_size_) + { + } + + size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } + + using Range = std::pair; + + // return upper exclusive range of values, i.e. 
[from_range, to_range> + std::optional nextRange() + { + if (from >= total_size) + { + return std::nullopt; + } + + auto to = from + range_step; + if (to >= total_size) + { + to = total_size; + } + + Range range{from, to}; + from = to; + return std::move(range); + } + +private: + size_t from; + size_t range_step; + size_t total_size; +}; + +} diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 157b6d46b6d..6668f570cb6 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,5 +1,6 @@ #pragma once +#include #include #if USE_AWS_S3 @@ -10,6 +11,7 @@ #include #include #include +#include #include @@ -62,6 +64,12 @@ public: size_t getFileOffsetOfBufferEnd() const override { return offset; } + void setRange(off_t begin, off_t end) + { + offset = begin; + read_until_position = end; + } + private: std::unique_ptr initialize(); @@ -76,6 +84,71 @@ private: bool restricted_seek; }; +/// Creates separate ReadBufferFromS3 for sequence of ranges of particular object +class ReadBufferS3Factory : public ParallelReadBuffer::ReadBufferFactory +{ +public: + explicit ReadBufferS3Factory( + std::shared_ptr client_ptr_, + const String & bucket_, + const String & key_, + size_t range_step_, + size_t object_size_, + UInt64 s3_max_single_read_retries_, + const ReadSettings &read_settings) + : client_ptr(client_ptr_) + , bucket(bucket_) + , key(key_) + , settings(read_settings) + , range_generator(object_size_, range_step_) + , range_step(range_step_) + , object_size(object_size_) + , s3_max_single_read_retries(s3_max_single_read_retries_) + { + assert(range_step > 0); + assert(range_step < object_size); + } + + size_t totalRanges() const + { + return static_cast(round(static_cast(object_size) / range_step)); + } + + SeekableReadBufferPtr getReader() override + { + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, bucket, key, s3_max_single_read_retries, settings); + reader->setRange(next_range->first, next_range->second - 1); + return reader; + } + + off_t seek(off_t off, [[maybe_unused]] int whence) override + { + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; + } + + std::optional getTotalSize() override { return object_size; } + +private: + std::shared_ptr client_ptr; + const String bucket; + const String key; + ReadSettings settings; + + RangeGenerator range_generator; + size_t range_step; + size_t object_size; + + UInt64 s3_max_single_read_retries; +}; + } #endif diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 061dd772212..f0c10759142 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -635,43 +636,6 @@ public: void buildNewSession(const Poco::URI & uri) override { session = makeHTTPSession(uri, timeouts); } }; -class RangeGenerator -{ -public: - explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) - : from(range_start), range_step(range_step_), total_size(total_size_) - { - } - - size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } - - using Range = std::pair; - - // return upper exclusive range of values, i.e. 
[from_range, to_range> - std::optional nextRange() - { - if (from >= total_size) - { - return std::nullopt; - } - - auto to = from + range_step; - if (to >= total_size) - { - to = total_size; - } - - Range range{from, to}; - from = to; - return std::move(range); - } - -private: - size_t from; - size_t range_step; - size_t total_size; -}; - class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase> { using Parent = detail::ReadWriteBufferFromHTTPBase>; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index e63e6fde1f4..971ac0d8a73 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -33,6 +33,8 @@ # include # include +#include // Y_IGNORE +#include // Y_IGNORE namespace { @@ -682,6 +684,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int S3_ERROR; } namespace S3 @@ -839,6 +842,26 @@ namespace S3 throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}", quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : ""); } + + size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error) + { + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket); + req.SetKey(key); + + Aws::S3::Model::HeadObjectOutcome outcome = client_ptr->HeadObject(req); + + if (outcome.IsSuccess()) + { + auto read_result = outcome.GetResultWithOwnership(); + return static_cast(read_result.GetContentLength()); + } + else if (throw_on_error) + { + throw DB::Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + } + return 0; + } } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 97cb4f74f90..c33ce427e66 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -75,6 +75,8 @@ struct URI static void validateBucket(const String & bucket, const Poco::URI & uri); }; +size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error = true); + } #endif diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f319bd1097b..6f5228c2c63 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1,4 +1,6 @@ #include +#include "IO/ParallelReadBuffer.h" +#include "IO/IOThreadPool.h" #include "Parsers/ASTCreateQuery.h" #if USE_AWS_S3 @@ -273,9 +275,13 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + std::make_unique(std::move(factory), &IOThreadPool::get(), 4), chooseCompressionMethod(current_key, compression_hint)); + auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); From 6785ad165a72e372f155a10e71a872e37bdfc2c1 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 14:06:30 +0000 Subject: [PATCH 03/41] Fix issue for mutliple download threads --- src/IO/ParallelReadBuffer.cpp | 2 +- src/IO/ParallelReadBuffer.h | 4 ++-- src/IO/ReadBufferFromS3.h | 4 ++-- src/Storages/StorageS3.cpp | 20 +++++++++++++++----- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git 
a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 7fa10c160ad..3ee62148c23 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -237,7 +237,7 @@ void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker) while (!emergency_stop && !read_worker->cancel) { if (!read_worker->reader->next()) - throw Exception("Failed to read all the data from the reader", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to read all the data from the reader, missing {} bytes", read_worker->bytes_left); if (emergency_stop || read_worker->cancel) break; diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 7b364205e8e..7a3f0cb3773 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -82,8 +82,8 @@ public: std::unique_ptr reader_factory_, ThreadPool * pool, size_t max_working_readers, - WorkerSetup worker_setup = {}, - WorkerCleanup worker_cleanup = {}); + WorkerSetup worker_setup = [](ThreadStatus &){}, + WorkerCleanup worker_cleanup = [](ThreadStatus &){}); ~ParallelReadBuffer() override { finishAndWait(); } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 6668f570cb6..e3708c4b748 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -60,7 +60,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position }; } + Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } @@ -124,7 +124,7 @@ public: auto reader = std::make_shared( client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second - 1); + reader->setRange(next_range->first, next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6f5228c2c63..9dc15e54ecc 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -275,12 +275,22 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + const auto max_download_threads = getContext()->getSettings().max_download_threads; + if (max_download_threads == 1) + { + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + chooseCompressionMethod(current_key, compression_hint)); + } + else + { + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), 4), - chooseCompressionMethod(current_key, compression_hint)); + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), + chooseCompressionMethod(current_key, compression_hint)); + } auto input_format = getContext()->getInputFormat(format, *read_buf, 
sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; From f693eba568415e66d856da7c45df824d97949361 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 22 Mar 2022 14:30:40 +0000 Subject: [PATCH 04/41] fix tests with approx rows --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 9 ++++++--- src/Storages/MergeTree/MergeTreeIOSettings.h | 2 ++ src/Storages/MergeTree/MergeTreeSelectProcessor.cpp | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1bfc1ec7306..cf7b2dcc855 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -45,7 +45,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & context) +static MergeTreeReaderSettings getMergeTreeReaderSettings( + const ContextPtr & context, const SelectQueryInfo & query_info) { const auto & settings = context->getSettingsRef(); return @@ -53,6 +54,7 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & con .read_settings = context->getReadSettings(), .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, + .read_in_order = query_info.input_order_info != nullptr, }; } @@ -82,7 +84,7 @@ ReadFromMergeTree::ReadFromMergeTree( getPrewhereInfo(query_info_), data_.getPartitionValueType(), virt_column_names_)}) - , reader_settings(getMergeTreeReaderSettings(context_)) + , reader_settings(getMergeTreeReaderSettings(context_, query_info_)) , prepared_parts(std::move(parts_)) , real_column_names(std::move(real_column_names_)) , virt_column_names(std::move(virt_column_names_)) @@ -203,6 +205,7 @@ ProcessorPtr ReadFromMergeTree::createSource( .colums_to_read = required_columns }; } + return std::make_shared( data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info, @@ -918,7 +921,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( total_marks_pk += part->index_granularity.getMarksCountWithoutFinal(); parts_before_pk = parts.size(); - auto reader_settings = getMergeTreeReaderSettings(context); + auto reader_settings = getMergeTreeReaderSettings(context, query_info); bool use_skip_indexes = settings.use_skip_indexes; if (select.final() && !settings.use_skip_indexes_if_final) diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index aaa8fae7dba..4fe424c98ff 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -20,6 +20,8 @@ struct MergeTreeReaderSettings bool save_marks_in_cache = false; /// Validate checksums on reading (should be always enabled in production). bool checksum_on_read = true; + /// True if we read in order of sorting key. 
+ bool read_in_order = false; }; struct MergeTreeWriterSettings diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 90e272e329e..f6cbf54b752 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,6 +37,14 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { + /// Actually it means that parallel reading from replicas enabled + /// and we have to collaborate with initiator. + /// In this case we won't set approximate rows, because it will be accounted multiple times. + /// Also do not count amount of read rows if we read in order of sorting key, + /// because we don't know actual amount of read rows in case when limit is set. + if (!extension_.has_value() && !reader_settings.read_in_order) + addTotalRowsApprox(total_rows); + ordered_names = header_without_virtual_columns.getNames(); } From f32ef2a556f4b6a667ec2a0ff3d3553703046dc0 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:15:18 +0000 Subject: [PATCH 05/41] Small polishing for S3 reader --- src/IO/ReadBufferFromS3.cpp | 4 +++- src/IO/ReadBufferFromS3.h | 48 +++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 93bbe02c9cd..05e1276bc63 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -42,6 +42,7 @@ ReadBufferFromS3::ReadBufferFromS3( UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, + size_t offset_, size_t read_until_position_, bool restricted_seek_) : SeekableReadBufferWithSize(nullptr, 0) @@ -49,9 +50,10 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , max_single_read_retries(max_single_read_retries_) + , offset(offset_) + , read_until_position(read_until_position_) , read_settings(settings_) , use_external_buffer(use_external_buffer_) - , read_until_position(read_until_position_) , restricted_seek(restricted_seek_) { } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index e3708c4b748..8d71a03fb87 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -#include +# include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include -#include +# include namespace Aws::S3 { @@ -32,7 +32,9 @@ private: String bucket; String key; UInt64 max_single_read_retries; + off_t offset = 0; + off_t read_until_position = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; @@ -47,6 +49,7 @@ public: UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer = false, + size_t offset_ = 0, size_t read_until_position_ = 0, bool restricted_seek_ = false); @@ -60,16 +63,10 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } - void setRange(off_t begin, off_t end) - { - offset = begin; - read_until_position = end; - } - private: std::unique_ptr initialize(); @@ 
-77,8 +74,6 @@ private: bool use_external_buffer; - off_t read_until_position = 0; - /// There is different seek policy for disk seek and for non-disk seek /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet). bool restricted_seek; @@ -95,11 +90,11 @@ public: size_t range_step_, size_t object_size_, UInt64 s3_max_single_read_retries_, - const ReadSettings &read_settings) + const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) , key(key_) - , settings(read_settings) + , read_settings(read_settings_) , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) @@ -109,11 +104,6 @@ public: assert(range_step < object_size); } - size_t totalRanges() const - { - return static_cast(round(static_cast(object_size) / range_step)); - } - SeekableReadBufferPtr getReader() override { const auto next_range = range_generator.nextRange(); @@ -123,8 +113,14 @@ public: } auto reader = std::make_shared( - client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second); + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second + 1); return reader; } @@ -140,7 +136,7 @@ private: std::shared_ptr client_ptr; const String bucket; const String key; - ReadSettings settings; + ReadSettings read_settings; RangeGenerator range_generator; size_t range_step; From 131b3a091c60a13ed7cd7ddfe090e5dda204b6ad Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:40:00 +0000 Subject: [PATCH 06/41] Refactor StorageS3 --- src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 70 ++++++++++++++++++++++++++------------ src/Storages/StorageS3.h | 6 +++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 8d71a03fb87..d984fbebe5f 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -120,7 +120,7 @@ public: read_settings, false /*use_external_buffer*/, next_range->first, - next_range->second + 1); + next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9dc15e54ecc..6966d356070 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -238,7 +238,8 @@ StorageS3Source::StorageS3Source( String compression_hint_, const std::shared_ptr & client_, const String & bucket_, - std::shared_ptr file_iterator_) + std::shared_ptr file_iterator_, + const size_t download_thread_num_) : SourceWithProgress(getHeader(sample_block_, need_path, need_file)) , WithContext(context_) , name(std::move(name_)) @@ -254,6 +255,7 @@ StorageS3Source::StorageS3Source( , with_file_column(need_file) , with_path_column(need_path) , file_iterator(file_iterator_) + , download_thread_num(download_thread_num_) { initialize(); } @@ -275,22 +277,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - const auto max_download_threads = getContext()->getSettings().max_download_threads; - if (max_download_threads == 1) - { - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), - chooseCompressionMethod(current_key, compression_hint)); - } - else - { - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, 
object_size, max_single_read_retries, getContext()->getReadSettings()); - - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), - chooseCompressionMethod(current_key, compression_hint)); - } + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -298,10 +285,9 @@ bool StorageS3Source::initialize() if (columns_desc.hasDefaults()) { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_desc, *input_format, getContext()); - }); + builder.addSimpleTransform( + [&](const Block & header) + { return std::make_shared(header, columns_desc, *input_format, getContext()); }); } pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -311,6 +297,43 @@ bool StorageS3Source::initialize() return true; } +std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key) +{ + const size_t object_size = DB::S3::getObjectSize(client, bucket, key, false); + + auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + const bool use_parallel_download = download_buffer_size > 0 && download_thread_num > 1; + const bool object_too_small = object_size < download_thread_num * download_buffer_size; + if (!use_parallel_download || object_too_small) + { + LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); + } + + assert(object_size > 0); + + if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) + { + LOG_WARNING( + &Poco::Logger::get("StorageS3Source"), + "Downloading buffer {} bytes too small, set at least {} bytes", + download_buffer_size, + DBMS_DEFAULT_BUFFER_SIZE); + download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; + } + + auto factory = std::make_unique( + client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); + LOG_TRACE( + &Poco::Logger::get("StorageS3Source"), + "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", + download_thread_num, + object_size, + download_buffer_size); + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); +} + String StorageS3Source::getName() const { return name; @@ -674,6 +697,8 @@ Pipe StorageS3::read( block_for_format = storage_snapshot->metadata->getSampleBlock(); } + const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; + const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -690,7 +715,8 @@ Pipe StorageS3::read( compression_method, client_auth.client, client_auth.uri.bucket, - iterator_wrapper)); + iterator_wrapper, + download_threads_per_stream)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 300b7becb93..6c88f5bba56 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -74,7 +74,8 @@ public: String compression_hint_, const std::shared_ptr & client_, const String & bucket, - std::shared_ptr file_iterator_); + std::shared_ptr file_iterator_, + size_t download_thread_num); String getName() const override; @@ -105,9 +106,12 @@ private: bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; + size_t download_thread_num = 1; /// Recreate ReadBuffer and BlockInputStream for each file. bool initialize(); + + std::unique_ptr createS3ReadBuffer(const String & key); }; /** From 93ad209910cb00c492bd727de0e5eed3baf2e55d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 11:52:31 +0000 Subject: [PATCH 07/41] Attach memory tracker --- src/Storages/StorageS3.cpp | 32 ++++++++++++++++++++++++++++---- src/Storages/StorageS3.h | 1 - 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6966d356070..35a8405eeb2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -293,7 +293,6 @@ bool StorageS3Source::initialize() pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); reader = std::make_unique(*pipeline); - initialized = false; return true; } @@ -331,7 +330,32 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k object_size, download_buffer_size); - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); + ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() + ? CurrentThread::get().getThreadGroup() + : MainThreadStatus::getInstance().getThreadGroup(); + + ContextPtr query_context + = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + + auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) + { + if (has_running_group) + thread_status.detachQuery(false); + }; + + auto worker_setup = [query_context = std::move(query_context), + running_group = std::move(running_group)](ThreadStatus & thread_status) + { + /// Save query context if any, because cache implementation needs it. + if (query_context) + thread_status.attachQueryContext(query_context); + + /// To be able to pass ProfileEvents. + if (running_group) + thread_status.attachQuery(running_group); + }; + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); } String StorageS3Source::getName() const @@ -698,7 +722,7 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); + // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -716,7 +740,7 @@ Pipe StorageS3::read( client_auth.client, client_auth.uri.bucket, iterator_wrapper, - download_threads_per_stream)); + max_download_threads)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6c88f5bba56..6362e777df6 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -102,7 +102,6 @@ private: std::unique_ptr reader; /// onCancel and generate can be called concurrently std::mutex reader_mutex; - bool initialized = false; bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; From 1a5f5c32bb2e2a96d22c0ef8132d29fc21db5aba Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 14:36:25 +0000 Subject: [PATCH 08/41] Remove commented out code --- src/Storages/StorageS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 35a8405eeb2..b80b9ead53f 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -722,7 +722,6 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( From e0d7b6dc3ed239ef4075add09090751d45941a4e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:30:06 +0000 Subject: [PATCH 09/41] Add tests for S3 multithreaded download --- tests/integration/test_storage_s3/test.py | 34 ++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index dd29d0a5d6a..7364af4d2dd 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -535,8 +535,9 @@ def test_multipart_put(started_cluster, maybe_auth, positive): one_line_length = 6 # 3 digits, 2 commas, 1 line separator. 
+ total_rows = csv_size_bytes // one_line_length # Generate data having size more than one part - int_data = [[1, 2, 3] for i in range(csv_size_bytes // one_line_length)] + int_data = [[1, 2, 3] for i in range(total_rows)] csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data]) assert len(csv_data) > min_part_size_bytes @@ -573,6 +574,37 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) + # select uploaded data from many threads + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) + ) + try: + select_result = run_query( + instance, + select_query, + settings={ + "max_download_threads": random.randint(4, 16), + "max_download_buffer_size": 1024 * 1024, + }, + ) + except helpers.client.QueryRuntimeException: + if positive: + raise + else: + assert positive + assert ( + select_result + == "\t".join(map(str, [total_rows, total_rows * 2, total_rows * 3])) + "\n" + ) + def test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] From 62a495a5fc64c24b83982a362c8e72b965f2fb09 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:38:11 +0000 Subject: [PATCH 10/41] Small refactor --- src/IO/ReadBufferFromS3.cpp | 30 ++++++++++++++++ src/IO/ReadBufferFromS3.h | 42 ++++++----------------- src/IO/S3Common.cpp | 3 +- src/Storages/StorageS3.cpp | 3 +- tests/integration/test_storage_s3/test.py | 2 +- 5 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 05e1276bc63..c78f41a0f9d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -274,6 +274,36 @@ std::unique_ptr ReadBufferFromS3::initialize() throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); } +SeekableReadBufferPtr ReadBufferS3Factory::getReader() +{ + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second); + return reader; +} + +off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence) +{ + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; +} + +std::optional ReadBufferS3Factory::getTotalSize() +{ + return object_size; +} } #endif diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index d984fbebe5f..dbba751b321 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -# include +#include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include -# include +#include namespace Aws::S3 { @@ -104,33 +104,11 @@ public: assert(range_step < object_size); } - SeekableReadBufferPtr getReader() override - { - const auto next_range = range_generator.nextRange(); - if (!next_range) - { - return nullptr; - } + SeekableReadBufferPtr getReader() override; - auto reader = std::make_shared( - client_ptr, - bucket, - key, - s3_max_single_read_retries, - read_settings, - false /*use_external_buffer*/, - 
next_range->first, - next_range->second); - return reader; - } + off_t seek(off_t off, [[maybe_unused]] int whence) override; - off_t seek(off_t off, [[maybe_unused]] int whence) override - { - range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; - return off; - } - - std::optional getTotalSize() override { return object_size; } + std::optional getTotalSize() override; private: std::shared_ptr client_ptr; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 971ac0d8a73..e706f0a75c1 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -24,6 +24,7 @@ # include # include # include +# include // Y_IGNORE # include # include @@ -33,8 +34,6 @@ # include # include -#include // Y_IGNORE -#include // Y_IGNORE namespace { diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b80b9ead53f..5625ff16a10 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -355,7 +355,8 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k thread_status.attachQuery(running_group); }; - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); + return std::make_unique( + std::move(factory), &IOThreadPool::get(), download_thread_num, std::move(worker_setup), std::move(worker_cleanup)); } String StorageS3Source::getName() const diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 7364af4d2dd..7fc59d9d9ef 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -517,7 +517,7 @@ def test_put_get_with_globs(started_cluster): # ("'minio','minio123',",True), Redirect with credentials not working with nginx. ], ) -def test_multipart_put(started_cluster, maybe_auth, positive): +def test_multipart(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = ( From 67195bfdd5f883e2e23e591bc78d2f7a56200c04 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 25 Mar 2022 21:00:00 +0000 Subject: [PATCH 11/41] support schema inference for type Object in format JSONEachRow --- src/Formats/JSONEachRowUtils.cpp | 20 +- src/Formats/ReadSchemaUtils.cpp | 40 ++++ src/Functions/FunctionsConversion.h | 173 +++++++++++++----- src/Interpreters/InterpreterInsertQuery.cpp | 1 + src/Interpreters/executeQuery.cpp | 16 +- src/Processors/Formats/ISchemaReader.h | 5 + .../01825_type_json_from_map.reference | 4 + .../0_stateless/01825_type_json_from_map.sql | 41 +++++ ...01825_type_json_schema_inference.reference | 8 + .../01825_type_json_schema_inference.sh | 52 ++++++ 10 files changed, 303 insertions(+), 57 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.reference create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.sql create mode 100644 tests/queries/0_stateless/01825_type_json_schema_inference.reference create mode 100755 tests/queries/0_stateless/01825_type_json_schema_inference.sh diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 66e0538fef1..80b9f84dafe 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -158,22 +159,29 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; + bool is_map = true; for (const auto key_value_pair : object) { auto type = 
getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type) - return nullptr; + if (!type || isObject(type)) + { + is_map = false; + break; + } if (value_type && value_type->getName() != type->getName()) - return nullptr; + { + is_map = false; + break; + } value_type = type; } - if (!value_type) - return nullptr; + if (is_map && value_type) + return std::make_shared(std::make_shared(), value_type); - return std::make_shared(std::make_shared(), value_type); + return std::make_shared("json", false); } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 559fac4cfaa..8e2531e2006 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace DB { @@ -17,6 +19,28 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static std::optional getOrderedColumnsList( + const NamesAndTypesList & columns_list, const Names & columns_order_hint) +{ + if (columns_list.size() != columns_order_hint.size()) + return {}; + + std::unordered_map available_columns; + for (const auto & [name, type] : columns_list) + available_columns.emplace(name, type); + + NamesAndTypesList res; + for (const auto & name : columns_order_hint) + { + auto it = available_columns.find(name); + if (it == available_columns.end()) + return {}; + + res.emplace_back(name, it->second); + } + return res; +} + ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, @@ -52,6 +76,22 @@ ColumnsDescription readSchemaFromFormat( { throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); } + + /// If we have "INSERT SELECT" query then try to order + /// columns as they are ordered in table schema for formats + /// without strict column order (like JSON and TSKV). + /// It will allow to execute simple data loading with query + /// "INSERT INTO table SELECT * FROM ..." + const auto & insertion_table = context->getInsertionTable(); + if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + { + auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); + auto metadata = storage->getInMemoryMetadataPtr(); + auto names_in_storage = metadata->getColumns().getNamesOfPhysical(); + auto ordered_list = getOrderedColumnsList(names_and_types, names_in_storage); + if (ordered_list) + names_and_types = *ordered_list; + } } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index e098378f51a..edddb200648 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -53,6 +53,7 @@ #include #include #include +#include namespace DB @@ -3140,52 +3141,138 @@ private: } } + WrapperType createTupleToObjectWrapper(const DataTypeTuple & from_tuple, bool has_nullable_subcolumns) const + { + if (!from_tuple.haveExplicitNames()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_tuple.getName()); + + PathsInData paths; + DataTypes from_types; + + std::tie(paths, from_types) = flattenTuple(from_tuple.getPtr()); + auto to_types = from_types; + + for (auto & type : to_types) + { + if (isTuple(type) || isNested(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. Got: {}", + from_tuple.getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns, from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + + WrapperType createMapToObjectWrapper(const DataTypeMap & from_map, bool has_nullable_subcolumns) const + { + auto key_value_types = from_map.getKeyValueTypes(); + + if (!isStringOrFixedString(key_value_types[0])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object from Map can be performed only from Map " + "with String or FixedString key. 
Got: {}", from_map.getName()); + + const auto & value_type = key_value_types[1]; + auto to_value_type = value_type; + + if (!has_nullable_subcolumns && value_type->isNullable()) + to_value_type = removeNullable(value_type); + + if (has_nullable_subcolumns && !value_type->isNullable()) + to_value_type = makeNullable(value_type); + + DataTypes to_key_value_types{std::make_shared(), std::move(to_value_type)}; + auto element_wrappers = getElementWrappers(key_value_types, to_key_value_types); + + return [has_nullable_subcolumns, element_wrappers, key_value_types, to_key_value_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t) -> ColumnPtr + { + const auto & column_map = assert_cast(*arguments.front().column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + auto key_value_columns = column_map.getNestedData().getColumnsCopy(); + + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element{{key_value_columns[i], key_value_types[i], ""}}; + key_value_columns[i] = element_wrappers[i](element, to_key_value_types[i], nullable_source, key_value_columns[i]->size()); + } + + const auto & key_column_str = assert_cast(*key_value_columns[0]); + const auto & value_column = *key_value_columns[1]; + + using SubcolumnsMap = HashMap; + SubcolumnsMap subcolumns; + + for (size_t row = 0; row < offsets.size(); ++row) + { + for (size_t i = offsets[static_cast(row) - 1]; i < offsets[row]; ++i) + { + auto ref = key_column_str.getDataAt(i); + + bool inserted; + SubcolumnsMap::LookupResult it; + subcolumns.emplace(ref, it, inserted); + auto & subcolumn = it->getMapped(); + + if (inserted) + subcolumn = value_column.cloneEmpty()->cloneResized(row); + + /// Map can have duplicated keys. We insert only first one. + if (subcolumn->size() == row) + subcolumn->insertFrom(value_column, i); + } + + /// Insert default values for keys missed in current row. + for (const auto & [_, subcolumn] : subcolumns) + if (subcolumn->size() == row) + subcolumn->insertDefault(); + } + + auto column_object = ColumnObject::create(has_nullable_subcolumns); + for (auto && [key, subcolumn] : subcolumns) + { + PathInData path(key.toView()); + column_object->addSubcolumn(path, std::move(subcolumn)); + } + + return column_object; + }; + } + WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const { if (const auto * from_tuple = checkAndGetDataType(from_type.get())) { - if (!from_tuple->haveExplicitNames()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. Got: {}", from_type->getName()); - - PathsInData paths; - DataTypes from_types; - - std::tie(paths, from_types) = flattenTuple(from_type); - auto to_types = from_types; - - for (auto & type : to_types) - { - if (isTuple(type) || isNested(type)) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_type->getName()); - - type = recursiveRemoveLowCardinality(type); - } - - return [element_wrappers = getElementWrappers(from_types, to_types), - has_nullable_subcolumns = to_type->hasNullableSubcolumns(), from_types, to_types, paths] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) - { - size_t tuple_size = to_types.size(); - auto flattened_column = flattenTuple(arguments.front().column); - const auto & column_tuple = assert_cast(*flattened_column); - - if (tuple_size != column_tuple.getColumns().size()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Expected tuple with {} subcolumn, but got {} subcolumns", - tuple_size, column_tuple.getColumns().size()); - - auto res = ColumnObject::create(has_nullable_subcolumns); - for (size_t i = 0; i < tuple_size; ++i) - { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; - auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); - res->addSubcolumn(paths[i], converted_column->assumeMutable()); - } - - return res; - }; + return createTupleToObjectWrapper(*from_tuple, to_type->hasNullableSubcolumns()); + } + else if (const auto * from_map = checkAndGetDataType(from_type.get())) + { + return createMapToObjectWrapper(*from_map, to_type->hasNullableSubcolumns()); } else if (checkAndGetDataType(from_type.get())) { @@ -3199,7 +3286,7 @@ private: } throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten named tuple or string. Got: {}", from_type->getName()); + "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } template diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index df44814a96e..ce369182766 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -358,6 +358,7 @@ BlockIO InterpreterInsertQuery::execute() auto new_context = Context::createCopy(context); new_context->setSettings(new_settings); + new_context->setInsertionTable(getContext()->getInsertionTable()); InterpreterSelectWithUnionQuery interpreter_select{ query.select, new_context, SelectQueryOptions(QueryProcessingStage::Complete, 1)}; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index c1606700540..89f5f4e6959 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -656,6 +656,14 @@ static std::tuple executeQueryImpl( limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode); } + if (const auto * insert_interpreter = typeid_cast(&*interpreter)) + { + /// Save insertion table (not table function). TODO: support remote() table function. + auto table_id = insert_interpreter->getDatabaseTable(); + if (!table_id.empty()) + context->setInsertionTable(std::move(table_id)); + } + { std::unique_ptr span; if (context->query_trace_context.trace_id != UUID()) @@ -666,14 +674,6 @@ static std::tuple executeQueryImpl( } res = interpreter->execute(); } - - if (const auto * insert_interpreter = typeid_cast(&*interpreter)) - { - /// Save insertion table (not table function). TODO: support remote() table function. 
- auto table_id = insert_interpreter->getDatabaseTable(); - if (!table_id.empty()) - context->setInsertionTable(std::move(table_id)); - } } if (process_list_entry) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 2d35809e26a..36cf0656119 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -18,6 +18,10 @@ public: virtual NamesAndTypesList readSchema() = 0; + /// True if order of columns is important in format. + /// Exceptions: JSON, TSKV. + virtual bool hasStrictOrderOfColumns() const { return true; } + virtual ~ISchemaReader() = default; protected: @@ -60,6 +64,7 @@ class IRowWithNamesSchemaReader : public ISchemaReader public: IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); NamesAndTypesList readSchema() override; + bool hasStrictOrderOfColumns() const override { return false; } protected: /// Read one row and determine types of columns in it. diff --git a/tests/queries/0_stateless/01825_type_json_from_map.reference b/tests/queries/0_stateless/01825_type_json_from_map.reference new file mode 100644 index 00000000000..dbcf67faef3 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.reference @@ -0,0 +1,4 @@ +800000 2000000 1400000 900000 +800000 2000000 1400000 900000 +Tuple(col0 UInt64, col1 UInt64, col2 UInt64, col3 UInt64, col4 UInt64, col5 UInt64, col6 UInt64, col7 UInt64, col8 UInt64) +1600000 4000000 2800000 1800000 diff --git a/tests/queries/0_stateless/01825_type_json_from_map.sql b/tests/queries/0_stateless/01825_type_json_from_map.sql new file mode 100644 index 00000000000..2480aca1667 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.sql @@ -0,0 +1,41 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) ENGINE = MergeTree ORDER BY id; +CREATE TABLE t_map(id UInt64, m Map(String, UInt64)) ENGINE = MergeTree ORDER BY id; + +INSERT INTO t_map +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(String, UInt64) +FROM numbers(1000000); + +INSERT INTO t_json SELECT id, m FROM t_map; +SELECT sum(m['col1']), sum(m['col4']), sum(m['col7']), sum(m['col8'] = 0) FROM t_map; +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; +SELECT toTypeName(obj) FROM t_json LIMIT 1; + +INSERT INTO t_json +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(FixedString(4), UInt64) +FROM numbers(1000000); + +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; + +INSERT INTO t_json +SELECT number, (range(number % 10), range(number % 10))::Map(UInt64, UInt64) +FROM numbers(1000000); -- { serverError 53 } + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.reference b/tests/queries/0_stateless/01825_type_json_schema_inference.reference new file mode 100644 index 00000000000..c2c18b5a2ff --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.reference @@ -0,0 +1,8 @@ +{"id":"1","obj":{"k1":1,"k2":{"k3":"2","k4":[{"k5":3,"k6":0},{"k5":4,"k6":0}]},"some":0},"s":"foo"} +{"id":"2","obj":{"k1":0,"k2":{"k3":"str","k4":[{"k5":0,"k6":55}]},"some":42},"s":"bar"} +Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), 
some Int8) +{"id":"1","obj":"aaa","s":"foo"} +{"id":"2","obj":"bbb","s":"bar"} +{"map":{"k1":1,"k2":2},"obj":{"k1":1,"k2.k3":2},"map_type":"Map(String, Nullable(Float64))","obj_type":"Object('json')"} +{"obj":{"k1":1,"k2":2},"map":{"k1":"1","k2":"2"}} +Tuple(k1 Float64, k2 Float64) diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.sh b/tests/queries/0_stateless/01825_type_json_schema_inference.sh new file mode 100755 index 00000000000..4e8758761c5 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj JSON, s String) \ + ENGINE = MergeTree ORDER BY id" --allow_experimental_object_type 1 + +user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* + +filename="${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json" + +echo '{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}' > $filename +echo '{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj String, s String) ENGINE = MergeTree ORDER BY id" + +echo '{"obj": "aaa", "id": 1, "s": "foo"}' > $filename +echo '{"id": 2, "obj": "bbb", "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": {"k3": 2}}}' > $filename + +${CLICKHOUSE_CLIENT} -q "SELECT map, obj, toTypeName(map) AS map_type, toTypeName(obj) AS obj_type \ + FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow') FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (obj JSON, map Map(String, UInt64)) \ + ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": 2}}' > $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" From 
a6450be8b625b26f188a0f176217226fe648de86 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Sat, 26 Mar 2022 01:11:16 +0000 Subject: [PATCH 12/41] fix schema inference --- src/Formats/JSONEachRowUtils.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 80b9f84dafe..fb1ddb479f2 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -159,29 +159,37 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; - bool is_map = true; + bool is_object = false; for (const auto key_value_pair : object) { auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type || isObject(type)) + if (!type) + continue; + + if (isObject(type)) { - is_map = false; + is_object = true; break; } - if (value_type && value_type->getName() != type->getName()) + if (!value_type) { - is_map = false; + value_type = type; + } + else if (!value_type->equals(*type)) + { + is_object = true; break; } - - value_type = type; } - if (is_map && value_type) + if (is_object) + return std::make_shared("json", false); + + if (value_type) return std::make_shared(std::make_shared(), value_type); - return std::make_shared("json", false); + return nullptr; } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; From d9d826c813cafced31842be0ac3397b1c6292f60 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 08:19:23 +0000 Subject: [PATCH 13/41] Address PR review --- src/Common/RangeGenerator.h | 2 +- src/IO/ReadBufferFromS3.cpp | 19 +++++++++++++------ src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 17 ++++------------- src/Storages/StorageS3.h | 2 ++ 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h index 8d3b88936fc..8f097087c8c 100644 --- a/src/Common/RangeGenerator.h +++ b/src/Common/RangeGenerator.h @@ -34,7 +34,7 @@ public: Range range{from, to}; from = to; - return std::move(range); + return range; } private: diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index c78f41a0f9d..6616d92b492 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -1,4 +1,5 @@ #include +#include "IO/S3Common.h" #if USE_AWS_S3 @@ -212,13 +213,14 @@ std::optional ReadBufferFromS3::getTotalSize() if (file_size) return file_size; - Aws::S3::Model::HeadObjectRequest request; - request.SetBucket(bucket); - request.SetKey(key); + auto object_size = S3::getObjectSize(client_ptr, bucket, key, false); - auto outcome = client_ptr->HeadObject(request); - auto head_result = outcome.GetResultWithOwnership(); - file_size = head_result.GetContentLength(); + if (!object_size) + { + return std::nullopt; + } + + file_size = object_size; return file_size; } @@ -236,6 +238,11 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position) } } +SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const +{ + return Range{.left = static_cast(offset), .right = read_until_position ? 
std::optional{read_until_position - 1} : std::nullopt}; +} + std::unique_ptr ReadBufferFromS3::initialize() { Aws::S3::Model::GetObjectRequest req; diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index dbba751b321..5c9d709d58e 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -63,7 +63,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override; size_t getFileOffsetOfBufferEnd() const override { return offset; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b40e352e1fb..411f78e44d6 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -307,7 +307,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k const bool object_too_small = object_size < download_thread_num * download_buffer_size; if (!use_parallel_download || object_too_small) { - LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + LOG_TRACE(log, "Downloading object of size {} from S3 in single thread", object_size); return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); } @@ -315,29 +315,20 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) { - LOG_WARNING( - &Poco::Logger::get("StorageS3Source"), - "Downloading buffer {} bytes too small, set at least {} bytes", - download_buffer_size, - DBMS_DEFAULT_BUFFER_SIZE); + LOG_WARNING(log, "Downloading buffer {} bytes too small, set at least {} bytes", download_buffer_size, DBMS_DEFAULT_BUFFER_SIZE); download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; } auto factory = std::make_unique( client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); LOG_TRACE( - &Poco::Logger::get("StorageS3Source"), - "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", - download_thread_num, - object_size, - download_buffer_size); + log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() ? CurrentThread::get().getThreadGroup() : MainThreadStatus::getInstance().getThreadGroup(); - ContextPtr query_context - = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + ContextPtr query_context = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) { diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6362e777df6..e67223190ba 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -107,6 +107,8 @@ private: std::shared_ptr file_iterator; size_t download_thread_num = 1; + Poco::Logger * log = &Poco::Logger::get("StorageS3Source"); + /// Recreate ReadBuffer and BlockInputStream for each file. 
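/// Sketch of how the parallel path in createS3ReadBuffer splits an object into ranges,
/// assuming RangeGenerator::nextRange() yields half-open [from, to) ranges until the object
/// is exhausted. The helper name below is illustrative; each range would be served by one
/// reader created by the factory handed to ParallelReadBuffer.
static std::vector<RangeGenerator::Range> splitIntoRanges(size_t object_size, size_t range_step)
{
    RangeGenerator generator(object_size, range_step);

    std::vector<RangeGenerator::Range> ranges;
    ranges.reserve(generator.totalRanges());

    while (auto range = generator.nextRange())
        ranges.push_back(*range); /// range->first is inclusive, range->second is exclusive

    return ranges;
}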
bool initialize(); From 065bc13aad41c6a564e5803d0408a91f640fec43 Mon Sep 17 00:00:00 2001 From: pdai Date: Mon, 28 Mar 2022 20:58:18 +0800 Subject: [PATCH 14/41] fix typo --- .../mergetree-family/versionedcollapsingmergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index b81d2206bf4..f5f2c428ea7 100644 --- a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -8,7 +8,7 @@ toc_title: "版本折叠MergeTree" 这个引擎: - 允许快速写入不断变化的对象状态。 -- 删除后台中的旧对象状态。 这显着降低了存储体积。 +- 删除后台中的旧对象状态。 这显著降低了存储体积。 请参阅部分 [崩溃](#table_engines_versionedcollapsingmergetree) 有关详细信息。 From 0e4af89f69e1340d27846f24faf14268ae0c4a10 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 17:21:47 +0000 Subject: [PATCH 15/41] fix reading from type object --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 5 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 39 ++++++------ .../MergeTree/MergeTreeBlockReadUtils.h | 2 +- .../MergeTree/MergeTreeSequentialSource.cpp | 2 +- src/Storages/StorageSnapshot.cpp | 62 ++++++++++--------- src/Storages/StorageSnapshot.h | 4 ++ .../01825_type_json_missed_values.reference | 2 + .../01825_type_json_missed_values.sql | 19 ++++++ 9 files changed, 82 insertions(+), 55 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.reference create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 9028023dc80..6e1d73da427 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -558,9 +558,10 @@ size_t IMergeTreeDataPart::getFileSizeOrZero(const String & file_name) const return checksum->second.file_size; } -String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const +String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const { - const auto & storage_columns = metadata_snapshot->getColumns().getAllPhysical(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withSubcolumns(); + auto storage_columns = storage_snapshot->getColumns(options); MergeTreeData::AlterConversions alter_conversions; if (!parent_part) alter_conversions = storage.getAlterConversionsForPart(shared_from_this()); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 85088b10f70..0dfac49bf30 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -164,7 +164,7 @@ public: /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). /// If no checksums are present returns the name of the first physically existing column. 
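/// Note on the signature change below: the method now receives the StorageSnapshot rather
/// than the static metadata, because the snapshot additionally carries the dynamically
/// inferred subcolumns of Object ('JSON') columns; see the getColumns(options) call with
/// withExtendedObjects().withSubcolumns() in the corresponding .cpp change above.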
- String getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const; + String getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const; bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 6e72b843f10..97516ea06c6 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -24,7 +24,7 @@ namespace /// least one existing (physical) column in part. bool injectRequiredColumnsRecursively( const String & column_name, - const ColumnsDescription & storage_columns, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::AlterConversions & alter_conversions, const MergeTreeData::DataPartPtr & part, Names & columns, @@ -36,7 +36,8 @@ bool injectRequiredColumnsRecursively( /// stages. checkStackSize(); - auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + auto column_in_storage = storage_snapshot->tryGetColumn(options, column_name); if (column_in_storage) { auto column_name_in_part = column_in_storage->getNameInStorage(); @@ -63,7 +64,8 @@ bool injectRequiredColumnsRecursively( /// Column doesn't have default value and don't exist in part /// don't need to add to required set. - const auto column_default = storage_columns.getDefault(column_name); + auto metadata_snapshot = storage_snapshot->getMetadataForQuery(); + const auto column_default = metadata_snapshot->getColumns().getDefault(column_name); if (!column_default) return false; @@ -73,39 +75,36 @@ bool injectRequiredColumnsRecursively( bool result = false; for (const auto & identifier : identifiers) - result |= injectRequiredColumnsRecursively(identifier, storage_columns, alter_conversions, part, columns, required_columns, injected_columns); + result |= injectRequiredColumnsRecursively(identifier, storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); return result; } } -NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & part, Names & columns) +NameSet injectRequiredColumns( + const MergeTreeData & storage, + const StorageSnapshotPtr & storage_snapshot, + const MergeTreeData::DataPartPtr & part, + Names & columns) { NameSet required_columns{std::begin(columns), std::end(columns)}; NameSet injected_columns; bool have_at_least_one_physical_column = false; - - const auto & storage_columns = metadata_snapshot->getColumns(); MergeTreeData::AlterConversions alter_conversions; if (!part->isProjectionPart()) alter_conversions = storage.getAlterConversionsForPart(part); + for (size_t i = 0; i < columns.size(); ++i) { - auto name_in_storage = Nested::extractTableName(columns[i]); - if (storage_columns.has(name_in_storage) && isObject(storage_columns.get(name_in_storage).type)) - { - have_at_least_one_physical_column = true; - continue; - } - /// We are going to fetch only physical columns - if (!storage_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, columns[i])) - throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + auto options = 
GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + if (!storage_snapshot->tryGetColumn(options, columns[i])) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[i]); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( - columns[i], storage_columns, alter_conversions, + columns[i], storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); } @@ -115,7 +114,7 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada */ if (!have_at_least_one_physical_column) { - const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(metadata_snapshot); + const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(storage_snapshot); columns.push_back(minimum_size_column_name); /// correctly report added column injected_columns.insert(columns.back()); @@ -271,7 +270,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names pre_column_names; /// inject columns required for defaults evaluation - bool should_reorder = !injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, column_names).empty(); + bool should_reorder = !injectRequiredColumns(storage, storage_snapshot, data_part, column_names).empty(); if (prewhere_info) { @@ -296,7 +295,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); - const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, pre_column_names); + const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot, data_part, pre_column_names); if (!injected_pre_columns.empty()) should_reorder = true; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 2373881f954..b4293b4ce3d 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -22,7 +22,7 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptrrows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, storage_snapshot->metadata, data_part, columns_to_read); + injectRequiredColumns(storage, storage_snapshot, data_part, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index e214afc6a90..8a82c5387c5 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -51,40 +51,42 @@ NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) NamesAndTypesList StorageSnapshot::getColumnsByNames(const GetColumnsOptions & options, const Names & names) const { NamesAndTypesList res; - const auto & columns = getMetadataForQuery()->getColumns(); for (const auto & name : names) + res.push_back(getColumn(options, name)); + return res; +} + +std::optional StorageSnapshot::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto column = columns.tryGetColumn(options, column_name); + if (column && (!isObject(column->type) || !options.with_extended_objects)) + return column; + + if (options.with_extended_objects) { - auto column = columns.tryGetColumn(options, name); - if (column && !isObject(column->type)) - { - 
res.emplace_back(std::move(*column)); - continue; - } - - if (options.with_extended_objects) - { - auto object_column = object_columns.tryGetColumn(options, name); - if (object_column) - { - res.emplace_back(std::move(*object_column)); - continue; - } - } - - if (options.with_virtuals) - { - auto it = virtual_columns.find(name); - if (it != virtual_columns.end()) - { - res.emplace_back(name, it->second); - continue; - } - } - - throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", name); + auto object_column = object_columns.tryGetColumn(options, column_name); + if (object_column) + return object_column; } - return res; + if (options.with_virtuals) + { + auto it = virtual_columns.find(column_name); + if (it != virtual_columns.end()) + return NameAndTypePair(column_name, it->second); + } + + return {}; +} + +NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, const String & column_name) const +{ + auto column = tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", column_name); + + return *column; } Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 46244827f6c..909f4fd5cab 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -61,6 +61,10 @@ struct StorageSnapshot /// Get columns with types according to options only for requested names. NamesAndTypesList getColumnsByNames(const GetColumnsOptions & options, const Names & names) const; + /// Get column with type according to options for requested name. + std::optional tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; + /// Block with ordinary + materialized + aliases + virtuals + subcolumns. 
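/// A minimal usage sketch of the accessors declared above. The helper itself and the
/// nullptr fallback are illustrative only; tryGetColumn/getColumn are the API added here.
static DataTypePtr tryGetColumnType(const StorageSnapshotPtr & snapshot, const String & name)
{
    auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
                       .withExtendedObjects()
                       .withSubcolumns();

    /// Returns std::nullopt for an unknown name; getColumn() with the same
    /// arguments would throw NO_SUCH_COLUMN_IN_TABLE instead.
    if (auto column = snapshot->tryGetColumn(options, name))
        return column->type;

    return nullptr;
}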
Block getSampleBlockForColumns(const Names & column_names) const; diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.reference b/tests/queries/0_stateless/01825_type_json_missed_values.reference new file mode 100644 index 00000000000..b480493995b --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.reference @@ -0,0 +1,2 @@ +Tuple(foo Int8, k1 Int8, k2 Int8) +1 diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.sql b/tests/queries/0_stateless/01825_type_json_missed_values.sql new file mode 100644 index 00000000000..2420ab7cf34 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.sql @@ -0,0 +1,19 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) +ENGINE = MergeTree ORDER BY id +SETTINGS min_bytes_for_wide_part = 0; + +SYSTEM STOP MERGES t_json; + +INSERT INTO t_json SELECT number, '{"k1": 1, "k2": 2}' FROM numbers(1000000); +INSERT INTO t_json VALUES (1000001, '{"foo": 1}'); + +SELECT toTypeName(obj) FROM t_json LIMIT 1; +SELECT count() FROM t_json WHERE obj.foo != 0; + +DROP TABLE IF EXISTS t_json; From 24c0cf86d44682ea0d8fb967cb17837191381d82 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 19:25:51 +0000 Subject: [PATCH 16/41] add function 'flattenTuple' --- src/Functions/flattenTuple.cpp | 68 +++++++++++++++++++ .../registerFunctionsMiscellaneous.cpp | 2 + .../0_stateless/02246_flatten_tuple.reference | 4 ++ .../0_stateless/02246_flatten_tuple.sql | 22 ++++++ 4 files changed, 96 insertions(+) create mode 100644 src/Functions/flattenTuple.cpp create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.reference create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.sql diff --git a/src/Functions/flattenTuple.cpp b/src/Functions/flattenTuple.cpp new file mode 100644 index 00000000000..f5d5b4cb773 --- /dev/null +++ b/src/Functions/flattenTuple.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + +class FunctionFlattenTuple : public IFunction +{ +public: + static constexpr auto name = "flattenTuple"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto & type = arguments[0]; + const auto * type_tuple = checkAndGetDataType(type.get()); + if (!type_tuple || !type_tuple->haveExplicitNames()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function '{}' must be Named Tuple. 
Got '{}'", + getName(), type->getName()); + + auto [paths, types] = flattenTuple(type); + Names names; + names.reserve(paths.size()); + for (const auto & path : paths) + names.push_back(path.getPath()); + + return std::make_shared(types, names); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + auto column = arguments.at(0).column; + if (!checkAndGetColumn(column.get())) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}. Expected ColumnTuple", + column->getName(), getName()); + + return flattenTuple(column); + } +}; + +} + +void registerFunctionFlattenTuple(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 76d61ce509a..40a8409cd15 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -80,6 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory & factory); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); +void registerFunctionFlattenTuple(FunctionFactory &); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -166,6 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionServerUUID(factory); registerFunctionZooKeeperSessionUptime(factory); registerFunctionGetOSKernelVersion(factory); + registerFunctionFlattenTuple(factory); #if USE_ICU registerFunctionConvertCharset(factory); diff --git a/tests/queries/0_stateless/02246_flatten_tuple.reference b/tests/queries/0_stateless/02246_flatten_tuple.reference new file mode 100644 index 00000000000..0320150025d --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.reference @@ -0,0 +1,4 @@ +([1,2],['a','b'],3,'c',4) Tuple(`t1.a` Array(UInt32), `t1.s` Array(String), b UInt32, `t2.k` String, `t2.v` UInt32) +Tuple(id Int8, obj Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), some Int8), s String) Tuple(id Int8, `obj.k1` Int8, `obj.k2.k3` String, `obj.k2.k4.k5` Array(Int8), `obj.k2.k4.k6` Array(Int8), `obj.some` Int8, s String) +1 1 2 [3,4] [0,0] 0 foo +2 0 str [0] [55] 42 bar diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql new file mode 100644 index 00000000000..e73f382b261 --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; + +SET flatten_nested = 0; + +CREATE TABLE t_flatten_tuple(t Tuple(t1 Nested(a UInt32, s String), b UInt32, t2 Tuple(k String, v UInt32))) ENGINE = Memory; + +INSERT INTO t_flatten_tuple VALUES (([(1, 'a'), (2, 'b')], 3, ('c', 4))); + +SELECT flattenTuple(t) AS ft, toTypeName(ft) FROM t_flatten_tuple; + +SET allow_experimental_object_type = 1; +CREATE TABLE t_flatten_object(data JSON) ENGINE = Memory; + +INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}'); +INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); + +SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object; + +DROP 
TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; From cd3cc60d5dbdf05bcf585bac7b2bd4c71d1177ed Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 21:13:20 +0000 Subject: [PATCH 17/41] fix fasttest --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index e73f382b261..927e3b9ec80 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS t_flatten_tuple; DROP TABLE IF EXISTS t_flatten_object; From 98fe1ca1ecefed61b3ea4b4416aef4988546ab2c Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 29 Mar 2022 12:02:21 +0200 Subject: [PATCH 18/41] Fix --- programs/local/LocalServer.cpp | 9 +++---- ...6_clickhouse_local_drop_database.reference | 0 .../02246_clickhouse_local_drop_database.sh | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference create mode 100755 tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 26d42a11315..bb6684ca137 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -184,6 +184,11 @@ void LocalServer::tryInitPath() if (path.back() != '/') path += '/'; + fs::create_directories(fs::path(path) / "user_defined/"); + fs::create_directories(fs::path(path) / "data/"); + fs::create_directories(fs::path(path) / "metadata/"); + fs::create_directories(fs::path(path) / "metadata_dropped/"); + global_context->setPath(path); global_context->setTemporaryStorage(path + "tmp"); @@ -565,7 +570,6 @@ void LocalServer::processConfig() /// Lock path directory before read status.emplace(fs::path(path) / "status", StatusFile::write_full_info); - fs::create_directories(fs::path(path) / "user_defined/"); LOG_DEBUG(log, "Loading user defined objects from {}", path); Poco::File(path + "user_defined/").createDirectories(); UserDefinedSQLObjectsLoader::instance().loadObjects(global_context); @@ -573,9 +577,6 @@ void LocalServer::processConfig() LOG_DEBUG(log, "Loaded user defined objects."); LOG_DEBUG(log, "Loading metadata from {}", path); - fs::create_directories(fs::path(path) / "data/"); - fs::create_directories(fs::path(path) / "metadata/"); - loadMetadataSystem(global_context); attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE)); attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh new file mode 100755 index 00000000000..00f3904192f --- /dev/null +++ b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +dir=${CLICKHOUSE_TEST_UNIQUE_NAME} +[[ -d $dir ]] && rm -r $dir +mkdir $dir +$CLICKHOUSE_LOCAL --multiline --multiquery --path $dir -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" + +$CLICKHOUSE_LOCAL --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" From f0b4a9fd15642240619dddfd4ebd5419b8b54e79 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:56:49 +0000 Subject: [PATCH 19/41] Fix construction of ReadBufferFromS3 --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 8f91804bbbe..0d50b24f7a5 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -44,7 +44,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S { return std::make_unique( client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, - settings, /* use_external_buffer */true, read_until_position, /* restricted_seek */true); + settings, /* use_external_buffer */true, /* offset */ 0, read_until_position, /* restricted_seek */true); }; if (with_cache) From c59926dcc7e3bb52be7b9b86ead120ae0d8b640d Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 12:13:02 +0000 Subject: [PATCH 20/41] fix tests --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index 927e3b9ec80..139f7a621ef 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -18,7 +18,7 @@ INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; -SELECT untuple(flattenTuple(data)) FROM t_flatten_object; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object ORDER BY data.id; DROP TABLE IF EXISTS t_flatten_tuple; DROP TABLE IF EXISTS t_flatten_object; From 3246261da8a3152bc0ecf0c8855120454d76877f Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 2022 15:43:42 +0300 Subject: [PATCH 21/41] Fix logging in `test_distributed_respect_user_timeouts` (#35575) * Update test.py * Update test.py * Update test.py Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .../integration/test_distributed_respect_user_timeouts/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_distributed_respect_user_timeouts/test.py b/tests/integration/test_distributed_respect_user_timeouts/test.py index 9cf7082d63a..567377aba0b 100644 --- a/tests/integration/test_distributed_respect_user_timeouts/test.py +++ b/tests/integration/test_distributed_respect_user_timeouts/test.py @@ -94,7 +94,7 @@ def _check_exception(exception, expected_tries=3): @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def started_cluster(request): - 
cluster = ClickHouseCluster(__file__) + cluster = ClickHouseCluster(__file__, request.param) cluster.__with_ssl_config = request.param == "configs_secure" main_configs = [] main_configs += [os.path.join(request.param, "config.d/remote_servers.xml")] From 9990abb76afad4ac7c65895976d777ded0ca9d1c Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 09:48:17 +0000 Subject: [PATCH 22/41] Use compile-time check for Exception messages, fix wrong messages --- src/Common/Exception.h | 22 +++++++++---------- src/Databases/DatabaseReplicatedWorker.cpp | 8 +++++-- src/Dictionaries/XDBCDictionarySource.cpp | 2 +- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 9 ++++++-- src/Formats/FormatSchemaInfo.cpp | 9 +++++--- src/Functions/FunctionsStringArray.h | 2 +- src/Functions/IFunction.cpp | 9 +++++--- src/Functions/castOrDefault.cpp | 2 +- src/Functions/dateName.cpp | 2 +- src/Interpreters/CatBoostModel.cpp | 1 + src/Interpreters/DatabaseCatalog.cpp | 8 +++---- src/Interpreters/executeDDLQueryOnCluster.cpp | 19 ++++++++++------ .../Formats/Impl/MsgPackRowInputFormat.cpp | 4 ++-- 13 files changed, 59 insertions(+), 38 deletions(-) diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 0bf89e7a447..c9bb8cc9f12 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -35,10 +35,10 @@ public: {} // Format message with fmt::format, like the logging functions. - template - Exception(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + Exception(int code, fmt::format_string fmt, Args &&... args) : Exception(fmt::format(fmt, std::forward(args)...), code) + { + } struct CreateFromPocoTag {}; struct CreateFromSTDTag {}; @@ -52,10 +52,10 @@ public: const char * what() const throw() override { return message().data(); } /// Add something to the existing message. - template - void addMessage(const std::string& format, Args&&... args) + template + void addMessage(fmt::format_string format, Args &&... args) { - extendedMessage(fmt::format(fmt::runtime(format), std::forward(args)...)); + extendedMessage(fmt::format(format, std::forward(args)...)); } void addMessage(const std::string& message) @@ -117,10 +117,10 @@ public: ParsingException(int code, const std::string & message); // Format message with fmt::format, like the logging functions. - template - ParsingException(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + ParsingException(int code, fmt::format_string fmt, Args &&... args) : Exception(code, fmt, std::forward(args)...) 
+ { + } std::string displayText() const diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 365a5d02816..b45cfb16362 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -179,8 +179,12 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr if (!task->was_executed) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}", - task->execution_status.code, task->execution_status.message); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Entry {} was executed, but was not committed: code {}: {}", + task->entry_name, + task->execution_status.code, + task->execution_status.message); } try_node->setAlreadyRemoved(); diff --git a/src/Dictionaries/XDBCDictionarySource.cpp b/src/Dictionaries/XDBCDictionarySource.cpp index f08abcdc516..764a7072ca0 100644 --- a/src/Dictionaries/XDBCDictionarySource.cpp +++ b/src/Dictionaries/XDBCDictionarySource.cpp @@ -50,7 +50,7 @@ namespace { if (!qualified_name.database.empty()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Dictionary source of type {} specifies a schema but schema is not supported by {}-driver", + "Dictionary source specifies a schema but schema is not supported by {}-driver", bridge_.getName()); } diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index 5cab2cb2995..4b614cd10e0 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -392,8 +392,13 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment) if (bytes_to_predownload) throw Exception( ErrorCodes::LOGICAL_ERROR, - "Failed to predownload remaining {} bytes. Current file segment: {}, current download offset: {}, expected: {}, eof: {}", - file_segment->range().toString(), file_segment->getDownloadOffset(), file_offset_of_buffer_end, implementation_buffer->eof()); + "Failed to predownload remaining {} bytes. 
Current file segment: {}, current download offset: {}, expected: {}, " + "eof: {}", + bytes_to_predownload, + file_segment->range().toString(), + file_segment->getDownloadOffset(), + file_offset_of_buffer_end, + implementation_buffer->eof()); auto result = implementation_buffer->hasPendingData(); diff --git a/src/Formats/FormatSchemaInfo.cpp b/src/Formats/FormatSchemaInfo.cpp index 24c8dfc14f2..c1157be6e7a 100644 --- a/src/Formats/FormatSchemaInfo.cpp +++ b/src/Formats/FormatSchemaInfo.cpp @@ -85,9 +85,12 @@ FormatSchemaInfo::FormatSchemaInfo(const String & format_schema, const String & else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) { if (is_server) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", - path.string()); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory(), + path.string(), + default_schema_directory()); path = default_schema_directory_path / path; schema_path = path.filename(); schema_directory = path.parent_path() / ""; diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index a1256598f1b..6b3adf46ff5 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -259,7 +259,7 @@ public: throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.", - arguments.size()); + name, arguments.size()); if (!isString(arguments[0])) throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index cfb4e12a025..19638c78daf 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -181,9 +181,12 @@ ColumnPtr IExecutableFunction::defaultImplementationForNulls( // Default implementation for nulls returns null result for null arguments, // so the result type must be nullable. if (!result_type->isNullable()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Function {} with Null argument and default implementation for Nulls " - "is expected to return Nullable result, got {}", result_type->getName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Function {} with Null argument and default implementation for Nulls " + "is expected to return Nullable result, got {}", + getName(), + result_type->getName()); return result_type->createColumnConstWithDefaultValue(input_rows_count); } diff --git a/src/Functions/castOrDefault.cpp b/src/Functions/castOrDefault.cpp index 628ac57f34d..f7b93ec2e83 100644 --- a/src/Functions/castOrDefault.cpp +++ b/src/Functions/castOrDefault.cpp @@ -231,7 +231,7 @@ private: { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} decimal scale should have native UInt type. 
Actual {}", - scale_argument.type->getName()); + getName(), scale_argument.type->getName()); } scale = arguments[additional_argument_index].column->getUInt(0); diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index eef9bc3955b..91ea8995777 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -112,7 +112,7 @@ public: || (res = executeType(arguments, result_type)))) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of function {], must be Date or DateTime.", + "Illegal column {} of function {}, must be Date or DateTime.", arguments[1].column->getName(), getName()); diff --git a/src/Interpreters/CatBoostModel.cpp b/src/Interpreters/CatBoostModel.cpp index cffaa81c4f0..d5803ed9e36 100644 --- a/src/Interpreters/CatBoostModel.cpp +++ b/src/Interpreters/CatBoostModel.cpp @@ -169,6 +169,7 @@ public: if (columns.size() != float_features_count + cat_features_count) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}", + columns.size(), float_features_count, cat_features_count); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 360a5d430e0..2f51d942403 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -233,7 +233,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( { assert(!db_and_table.first && !db_and_table.second); if (exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); return {}; } @@ -263,7 +263,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( /// If table_id has no UUID, then the name of database was specified by user and table_id was not resolved through context. /// Do not allow access to TEMPORARY_DATABASE because it contains all temporary tables of all contexts and users. 
if (exception) - exception->emplace(ErrorCodes::DATABASE_ACCESS_DENIED, "Direct access to `{}` database is not allowed", String(TEMPORARY_DATABASE)); + exception->emplace(fmt::format("Direct access to `{}` database is not allowed", TEMPORARY_DATABASE), ErrorCodes::DATABASE_ACCESS_DENIED); return {}; } @@ -274,7 +274,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( if (databases.end() == it) { if (exception) - exception->emplace(ErrorCodes::UNKNOWN_DATABASE, "Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())); + exception->emplace(fmt::format("Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())), ErrorCodes::UNKNOWN_DATABASE); return {}; } database = it->second; @@ -282,7 +282,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( auto table = database->tryGetTable(table_id.table_name, context_); if (!table && exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); if (!table) database = nullptr; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index ce00676b2ed..f0279bafca2 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -320,12 +320,13 @@ Chunk DDLQueryStatusSource::generate() if (throw_on_timeout) { if (!first_exception) - first_exception = std::make_unique(ErrorCodes::TIMEOUT_EXCEEDED, msg_format, - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + first_exception = std::make_unique( + fmt::format(msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts), + ErrorCodes::TIMEOUT_EXCEEDED); return {}; } - LOG_INFO(log, fmt::runtime(msg_format), node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); NameSet unfinished_hosts = waiting_hosts; for (const auto & host_id : finished_hosts) @@ -358,9 +359,12 @@ Chunk DDLQueryStatusSource::generate() /// Paradoxically, this exception will be throw even in case of "never_throw" mode. if (!first_exception) - first_exception = std::make_unique(ErrorCodes::UNFINISHED, - "Cannot provide query execution status. The query's node {} has been deleted by the cleaner" - " since it was finished (or its lifetime is expired)", node_path); + first_exception = std::make_unique( + fmt::format( + "Cannot provide query execution status. 
The query's node {} has been deleted by the cleaner" + " since it was finished (or its lifetime is expired)", + node_path), + ErrorCodes::UNFINISHED); return {}; } @@ -386,7 +390,8 @@ Chunk DDLQueryStatusSource::generate() if (status.code != 0 && !first_exception && context->getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW) { - first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); + first_exception = std::make_unique( + fmt::format("There was an error on [{}:{}]: {}", host, port, status.message), status.code); } ++num_hosts_finished; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 607e6f36767..722cedbab30 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -359,7 +359,7 @@ bool MsgPackVisitor::visit_ext(const char * value, uint32_t size) return true; } - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {%x}", type); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {:x}", type); } void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT @@ -498,7 +498,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) msgpack::object_ext object_ext = object.via.ext; if (object_ext.type() == int8_t(MsgPackExtensionTypes::UUIDType)) return std::make_shared(); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {%x} is not supported", object_ext.type()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type()); } } __builtin_unreachable(); From ef484547bf9927c222c588113487a1534e61e39a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:14:38 +0000 Subject: [PATCH 23/41] Fix GCC build --- src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp index a72d7b12a4f..ebfeaed8346 100644 --- a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp +++ b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp @@ -1,10 +1,13 @@ +// Needs to go first because its partial specialization of fmt::formatter +// should be defined before any instantiation +#include + #include #include #include #include -#include #include namespace DB From c9feb7f7515cf2711877fa40844a1b55b4e03267 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 13:40:14 +0000 Subject: [PATCH 24/41] Format test file --- tests/integration/test_allowed_url_from_config/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From d0f01516db0c2d403f9d45898251bd417e78ad54 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:48:57 +0200 Subject: [PATCH 25/41] Resurrect automatic 
labelling --- .github/workflows/master.yml | 10 - tests/ci/cherry_pick_utils/parser.py | 8 - tests/ci/commit_status_helper.py | 5 + tests/ci/run_check.py | 34 +- utils/github/__init__.py | 1 - utils/github/backport.py | 185 ---------- utils/github/cherrypick.py | 323 ------------------ utils/github/local.py | 108 ------ utils/github/parser.py | 64 ---- utils/github/query.py | 492 --------------------------- 10 files changed, 31 insertions(+), 1199 deletions(-) delete mode 100644 utils/github/__init__.py delete mode 100644 utils/github/backport.py delete mode 100644 utils/github/cherrypick.py delete mode 100644 utils/github/local.py delete mode 100644 utils/github/parser.py delete mode 100644 utils/github/query.py diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 73cffc1d5d3..cfa95b84ee5 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -149,7 +149,6 @@ jobs: sudo rm -fr "$TEMP_PATH" SplitBuildSmokeTest: needs: [BuilderDebSplitted] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, style-checker] steps: - name: Set envs @@ -316,7 +315,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinRelease: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -362,7 +360,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinGCC: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -636,7 +633,6 @@ jobs: ########################################################################################## BuilderDebSplitted: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -682,7 +678,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinTidy: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -728,7 +723,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinDarwin: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -774,7 +768,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -820,7 +813,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinFreeBSD: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -866,7 +858,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
BuilderBinDarwinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -912,7 +903,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinPPC64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs diff --git a/tests/ci/cherry_pick_utils/parser.py b/tests/ci/cherry_pick_utils/parser.py index d8348e6d964..29c05e5328f 100644 --- a/tests/ci/cherry_pick_utils/parser.py +++ b/tests/ci/cherry_pick_utils/parser.py @@ -20,8 +20,6 @@ class Description: def __init__(self, pull_request): self.label_name = str() - self.legal = False - self._parse(pull_request["bodyText"]) def _parse(self, text): @@ -39,12 +37,6 @@ class Description: category = stripped next_category = False - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - category_headers = ( "Category (leave one):", "Changelog category (leave one):", diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index e379c9a2254..53dfb145b44 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -59,3 +59,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): with open(file_path, "w", encoding="utf-8") as f: out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + +def post_label(gh, pr_info, label_name): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + pull_request.add_to_labels(label_name) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9c7ba13f8e4..74e60582e8d 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit +from commit_status_helper import get_commit, post_label NAME = "Run Check (actions)" @@ -81,6 +81,21 @@ TRUSTED_CONTRIBUTORS = { ] } +MAP_CATEGORY_TO_LABEL = { + "New Feature": "pr-feature", + "Bug Fix": "pr-bugfix", + "Bug Fix (user-visible misbehaviour in official stable or prestable release)": "pr-bugfix", + "Improvement": "pr-improvement", + "Performance Improvement": "pr-performance", + "Backward Incompatible Change": "pr-backward-incompatible", + "Build/Testing/Packaging Improvement": "pr-build", + "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", + "Not for changelog": "pr-not-for-changelog", + "Documentation (changelog entry is not required)": "pr-documentation", + "Documentation": "pr-documentation", + # 'Other': doesn't match anything +} + def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: @@ -168,7 +183,7 @@ def check_pr_description(pr_info): + second_category + "'" ) - return result_status[:140] + return result_status[:140], category elif re.match( r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] @@ -190,19 +205,19 @@ def check_pr_description(pr_info): i += 1 if not category: - return "Changelog category is empty" + return "Changelog category is empty", category # 
Filter out the PR categories that are not for changelog. if re.match( r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", category, ): - return "" + return "", category if not entry: - return f"Changelog entry required for category '{category}'" + return f"Changelog entry required for category '{category}'", category - return "" + return "", category if __name__ == "__main__": @@ -213,7 +228,10 @@ if __name__ == "__main__": gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - description_report = check_pr_description(pr_info)[:139] + description_report, category = check_pr_description(pr_info) + if category in MAP_CATEGORY_TO_LABEL: + post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( @@ -225,7 +243,7 @@ if __name__ == "__main__": ) commit.create_status( context=NAME, - description=description_report, + description=description_report[:139], state="failure", target_url=url, ) diff --git a/utils/github/__init__.py b/utils/github/__init__.py deleted file mode 100644 index 40a96afc6ff..00000000000 --- a/utils/github/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/utils/github/backport.py b/utils/github/backport.py deleted file mode 100644 index 615c0d19ffa..00000000000 --- a/utils/github/backport.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- - -try: - from clickhouse.utils.github.cherrypick import CherryPick - from clickhouse.utils.github.query import Query as RemoteRepo - from clickhouse.utils.github.local import Repository as LocalRepo -except: - from .cherrypick import CherryPick - from .query import Query as RemoteRepo - from .local import Repository as LocalRepo - -import argparse -import logging -import re -import sys - - -class Backport: - def __init__(self, token, owner, name, team): - self._gh = RemoteRepo( - token, owner=owner, name=name, team=team, max_page_size=30, min_page_size=7 - ) - self._token = token - self.default_branch_name = self._gh.default_branch - self.ssh_url = self._gh.ssh_url - - def getPullRequests(self, from_commit): - return self._gh.get_pull_requests(from_commit) - - def getBranchesWithRelease(self): - branches = set() - for pull_request in self._gh.find_pull_requests("release"): - branches.add(pull_request["headRefName"]) - return branches - - def execute(self, repo, upstream, until_commit, run_cherrypick): - repo = LocalRepo(repo, upstream, self.default_branch_name) - all_branches = repo.get_release_branches() # [(branch_name, base_commit)] - - release_branches = self.getBranchesWithRelease() - - branches = [] - # iterate over all branches to preserve their precedence. - for branch in all_branches: - if branch[0] in release_branches: - branches.append(branch) - - if not branches: - logging.info("No release branches found!") - return - - for branch in branches: - logging.info("Found release branch: %s", branch[0]) - - if not until_commit: - until_commit = branches[0][1] - pull_requests = self.getPullRequests(until_commit) - - backport_map = {} - - RE_MUST_BACKPORT = re.compile(r"^v(\d+\.\d+)-must-backport$") - RE_NO_BACKPORT = re.compile(r"^v(\d+\.\d+)-no-backport$") - RE_BACKPORTED = re.compile(r"^v(\d+\.\d+)-backported$") - - # pull-requests are sorted by ancestry from the most recent. - for pr in pull_requests: - while repo.comparator(branches[-1][1]) >= repo.comparator( - pr["mergeCommit"]["oid"] - ): - logging.info( - "PR #{} is already inside {}. 
Dropping this branch for further PRs".format( - pr["number"], branches[-1][0] - ) - ) - branches.pop() - - logging.info("Processing PR #{}".format(pr["number"])) - - assert len(branches) - - branch_set = set([branch[0] for branch in branches]) - - # First pass. Find all must-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-must-backport": - backport_map[pr["number"]] = branch_set.copy() - continue - matched = RE_MUST_BACKPORT.match(label["name"]) - if matched: - if pr["number"] not in backport_map: - backport_map[pr["number"]] = set() - backport_map[pr["number"]].add(matched.group(1)) - - # Second pass. Find all no-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-no-backport" and pr["number"] in backport_map: - del backport_map[pr["number"]] - break - matched_no_backport = RE_NO_BACKPORT.match(label["name"]) - matched_backported = RE_BACKPORTED.match(label["name"]) - if ( - matched_no_backport - and pr["number"] in backport_map - and matched_no_backport.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_no_backport.group(1)) - logging.info( - "\tskipping %s because of forced no-backport", - matched_no_backport.group(1), - ) - elif ( - matched_backported - and pr["number"] in backport_map - and matched_backported.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_backported.group(1)) - logging.info( - "\tskipping %s because it's already backported manually", - matched_backported.group(1), - ) - - for pr, branches in list(backport_map.items()): - logging.info("PR #%s needs to be backported to:", pr) - for branch in branches: - logging.info( - "\t%s, and the status is: %s", - branch, - run_cherrypick(self._token, pr, branch), - ) - - # print API costs - logging.info("\nGitHub API total costs per query:") - for name, value in list(self._gh.api_costs.items()): - logging.info("%s : %s", name, value) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", type=str, required=True, help="token for Github access" - ) - parser.add_argument( - "--repo", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - parser.add_argument( - "--til", type=str, help="check PRs from HEAD til this commit", metavar="COMMIT" - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="do not create or merge any PRs", - default=False, - ) - parser.add_argument( - "--verbose", - "-v", - action="store_true", - help="more verbose output", - default=False, - ) - parser.add_argument( - "--upstream", - "-u", - type=str, - help="remote name of upstream in repository", - default="origin", - ) - args = parser.parse_args() - - if args.verbose: - logging.basicConfig( - format="%(message)s", stream=sys.stdout, level=logging.DEBUG - ) - else: - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO) - - cherrypick_run = lambda token, pr, branch: CherryPick( - token, "ClickHouse", "ClickHouse", "core", pr, branch - ).execute(args.repo, args.dry_run) - bp = Backport(args.token, "ClickHouse", "ClickHouse", "core") - bp.execute(args.repo, args.upstream, args.til, cherrypick_run) diff --git a/utils/github/cherrypick.py b/utils/github/cherrypick.py deleted file mode 100644 index c6469fa62a9..00000000000 --- a/utils/github/cherrypick.py +++ /dev/null @@ -1,323 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Backports changes from PR to release branch. 
-Requires multiple separate runs as part of the implementation. - -First run should do the following: -1. Merge release branch with a first parent of merge-commit of PR (using 'ours' strategy). (branch: backport/{branch}/{pr}) -2. Create temporary branch over merge-commit to use it for PR creation. (branch: cherrypick/{merge_commit}) -3. Create PR from temporary branch to backport branch (emulating cherry-pick). - -Second run checks PR from previous run to be merged or at least being mergeable. If it's not merged then try to merge it. - -Third run creates PR from backport branch (with merged previous PR) to release branch. -""" - -try: - from clickhouse.utils.github.query import Query as RemoteRepo -except: - from .query import Query as RemoteRepo - -import argparse -from enum import Enum -import logging -import os -import subprocess -import sys - - -class CherryPick: - class Status(Enum): - DISCARDED = "discarded" - NOT_INITIATED = "not started" - FIRST_MERGEABLE = "waiting for 1st stage" - FIRST_CONFLICTS = "conflicts on 1st stage" - SECOND_MERGEABLE = "waiting for 2nd stage" - SECOND_CONFLICTS = "conflicts on 2nd stage" - MERGED = "backported" - - def _run(self, args): - out = subprocess.check_output(args).rstrip() - logging.debug(out) - return out - - def __init__(self, token, owner, name, team, pr_number, target_branch): - self._gh = RemoteRepo(token, owner=owner, name=name, team=team) - self._pr = self._gh.get_pull_request(pr_number) - - self.ssh_url = self._gh.ssh_url - - # TODO: check if pull-request is merged. - - self.merge_commit_oid = self._pr["mergeCommit"]["oid"] - - self.target_branch = target_branch - self.backport_branch = "backport/{branch}/{pr}".format( - branch=target_branch, pr=pr_number - ) - self.cherrypick_branch = "cherrypick/{branch}/{oid}".format( - branch=target_branch, oid=self.merge_commit_oid - ) - - def getCherryPickPullRequest(self): - return self._gh.find_pull_request( - base=self.backport_branch, head=self.cherrypick_branch - ) - - def createCherryPickPullRequest(self, repo_path): - DESCRIPTION = ( - "This pull-request is a first step of an automated backporting.\n" - "It contains changes like after calling a local command `git cherry-pick`.\n" - "If you intend to continue backporting this changes, then resolve all conflicts if any.\n" - "Otherwise, if you do not want to backport them, then just close this pull-request.\n" - "\n" - "The check results does not matter at this step - you can safely ignore them.\n" - "Also this pull-request will be merged automatically as it reaches the mergeable state, but you always can merge it manually.\n" - ) - - # FIXME: replace with something better than os.system() - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@yandex-team.ru", - "-c", - "user.name=robot-clickhouse", - ] - base_commit_oid = self._pr["mergeCommit"]["parents"]["nodes"][0]["oid"] - - # Create separate branch for backporting, and make it look like real cherry-pick. - self._run(git_prefix + ["checkout", "-f", self.target_branch]) - self._run(git_prefix + ["checkout", "-B", self.backport_branch]) - self._run(git_prefix + ["merge", "-s", "ours", "--no-edit", base_commit_oid]) - - # Create secondary branch to allow pull request with cherry-picked commit. 
- self._run( - git_prefix + ["branch", "-f", self.cherrypick_branch, self.merge_commit_oid] - ) - - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.cherrypick_branch), - ] - ) - - # Create pull-request like a local cherry-pick - pr = self._gh.create_pull_request( - source=self.cherrypick_branch, - target=self.backport_branch, - title="Cherry pick #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ), - description="Original pull-request #{}\n\n{}".format( - self._pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. - self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "do not test") - self._gh.set_label(pr, "pr-cherrypick") - - return pr - - def mergeCherryPickPullRequest(self, cherrypick_pr): - return self._gh.merge_pull_request(cherrypick_pr["id"]) - - def getBackportPullRequest(self): - return self._gh.find_pull_request( - base=self.target_branch, head=self.backport_branch - ) - - def createBackportPullRequest(self, cherrypick_pr, repo_path): - DESCRIPTION = ( - "This pull-request is a last step of an automated backporting.\n" - "Treat it as a standard pull-request: look at the checks and resolve conflicts.\n" - "Merge it only if you intend to backport changes to the target branch, otherwise just close it.\n" - ) - - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@clickhouse.com", - "-c", - "user.name=robot-clickhouse", - ] - - pr_title = "Backport #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ) - - self._run(git_prefix + ["checkout", "-f", self.backport_branch]) - self._run(git_prefix + ["pull", "--ff-only", "origin", self.backport_branch]) - self._run( - git_prefix - + [ - "reset", - "--soft", - self._run( - git_prefix - + [ - "merge-base", - "origin/" + self.target_branch, - self.backport_branch, - ] - ), - ] - ) - self._run(git_prefix + ["commit", "-a", "--allow-empty", "-m", pr_title]) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - - pr = self._gh.create_pull_request( - source=self.backport_branch, - target=self.target_branch, - title=pr_title, - description="Original pull-request #{}\nCherry-pick pull-request #{}\n\n{}".format( - self._pr["number"], cherrypick_pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. 
- self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "pr-backport") - - return pr - - def execute(self, repo_path, dry_run=False): - pr1 = self.getCherryPickPullRequest() - if not pr1: - if not dry_run: - pr1 = self.createCherryPickPullRequest(repo_path) - logging.debug( - "Created PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - else: - return CherryPick.Status.NOT_INITIATED - else: - logging.debug( - "Found PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"] and pr1["mergeable"] == "MERGEABLE" and not pr1["closed"]: - if not dry_run: - pr1 = self.mergeCherryPickPullRequest(pr1) - logging.debug( - "Merged PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"]: - logging.debug( - "Waiting for PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if pr1["closed"]: - return CherryPick.Status.DISCARDED - elif pr1["mergeable"] == "CONFLICTING": - return CherryPick.Status.FIRST_CONFLICTS - else: - return CherryPick.Status.FIRST_MERGEABLE - - pr2 = self.getBackportPullRequest() - if not pr2: - if not dry_run: - pr2 = self.createBackportPullRequest(pr1, repo_path) - logging.debug( - "Created PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - else: - return CherryPick.Status.FIRST_MERGEABLE - else: - logging.debug( - "Found PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - - if pr2["merged"]: - return CherryPick.Status.MERGED - elif pr2["closed"]: - return CherryPick.Status.DISCARDED - elif pr2["mergeable"] == "CONFLICTING": - return CherryPick.Status.SECOND_CONFLICTS - else: - return CherryPick.Status.SECOND_MERGEABLE - - -if __name__ == "__main__": - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.DEBUG) - - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", "-t", type=str, required=True, help="token for Github access" - ) - parser.add_argument("--pr", type=str, required=True, help="PR# to cherry-pick") - parser.add_argument( - "--branch", - "-b", - type=str, - required=True, - help="target branch name for cherry-pick", - ) - parser.add_argument( - "--repo", - "-r", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - args = parser.parse_args() - - cp = CherryPick( - args.token, "ClickHouse", "ClickHouse", "core", args.pr, args.branch - ) - cp.execute(args.repo) diff --git a/utils/github/local.py b/utils/github/local.py deleted file mode 100644 index 571c9102ba0..00000000000 --- a/utils/github/local.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- - -import functools -import logging -import os -import re - - -class RepositoryBase: - def __init__(self, repo_path): - import git - - self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) - - # comparator of commits - def cmp(x, y): - if str(x) == str(y): - return 0 - if self._repo.is_ancestor(x, y): - return -1 - else: - return 1 - - self.comparator = functools.cmp_to_key(cmp) - - def get_head_commit(self): - return self._repo.commit(self._default) - - def iterate(self, begin, end): - rev_range = "{}...{}".format(begin, end) - for commit in self._repo.iter_commits(rev_range, first_parent=True): - yield commit - - -class 
Repository(RepositoryBase): - def __init__(self, repo_path, remote_name, default_branch_name): - super(Repository, self).__init__(repo_path) - self._remote = self._repo.remotes[remote_name] - self._remote.fetch() - self._default = self._remote.refs[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * remote branch (git.refs.remote.RemoteReference), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/remotes/.+/\d+\.\d+$") - - for branch in [ - r for r in self._remote.refs if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) - - -class BareRepository(RepositoryBase): - def __init__(self, repo_path, default_branch_name): - super(BareRepository, self).__init__(repo_path) - self._default = self._repo.branches[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * branch (git.refs.head?), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/heads/\d+\.\d+$") - - for branch in [ - r for r in self._repo.branches if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. 
Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) diff --git a/utils/github/parser.py b/utils/github/parser.py deleted file mode 100644 index d8348e6d964..00000000000 --- a/utils/github/parser.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - - -class Description: - """Parsed description representation""" - - MAP_CATEGORY_TO_LABEL = { - "New Feature": "pr-feature", - "Bug Fix": "pr-bugfix", - "Improvement": "pr-improvement", - "Performance Improvement": "pr-performance", - # 'Backward Incompatible Change': doesn't match anything - "Build/Testing/Packaging Improvement": "pr-build", - "Non-significant (changelog entry is not needed)": "pr-non-significant", - "Non-significant (changelog entry is not required)": "pr-non-significant", - "Non-significant": "pr-non-significant", - "Documentation (changelog entry is not required)": "pr-documentation", - # 'Other': doesn't match anything - } - - def __init__(self, pull_request): - self.label_name = str() - self.legal = False - - self._parse(pull_request["bodyText"]) - - def _parse(self, text): - lines = text.splitlines() - next_category = False - category = str() - - for line in lines: - stripped = line.strip() - - if not stripped: - continue - - if next_category: - category = stripped - next_category = False - - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - - category_headers = ( - "Category (leave one):", - "Changelog category (leave one):", - "Changelog category:", - "Category:", - ) - - if stripped in category_headers: - next_category = True - - if category in Description.MAP_CATEGORY_TO_LABEL: - self.label_name = Description.MAP_CATEGORY_TO_LABEL[category] - else: - if not category: - print("Cannot find category in pr description") - else: - print(("Unknown category: " + category)) diff --git a/utils/github/query.py b/utils/github/query.py deleted file mode 100644 index 7afbc57781c..00000000000 --- a/utils/github/query.py +++ /dev/null @@ -1,492 +0,0 @@ -# -*- coding: utf-8 -*- - -import requests - - -class Query: - """ - Implements queries to the Github API using GraphQL - """ - - _PULL_REQUEST = """ - author {{ - ... on User {{ - id - login - }} - }} - - baseRepository {{ - nameWithOwner - }} - - mergeCommit {{ - oid - parents(first: {min_page_size}) {{ - totalCount - nodes {{ - oid - }} - }} - }} - - mergedBy {{ - ... 
on User {{ - id - login - }} - }} - - baseRefName - closed - headRefName - id - mergeable - merged - number - title - url - """ - - def __init__(self, token, owner, name, team, max_page_size=100, min_page_size=10): - self._PULL_REQUEST = Query._PULL_REQUEST.format(min_page_size=min_page_size) - - self._token = token - self._owner = owner - self._name = name - self._team = team - - self._max_page_size = max_page_size - self._min_page_size = min_page_size - - self.api_costs = {} - - repo = self.get_repository() - self._id = repo["id"] - self.ssh_url = repo["sshUrl"] - self.default_branch = repo["defaultBranchRef"]["name"] - - self.members = set(self.get_members()) - - def get_repository(self): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - name - }} - id - sshUrl - }} - """ - - query = _QUERY.format(owner=self._owner, name=self._name) - return self._run(query)["repository"] - - def get_members(self): - """Get all team members for organization - - Returns: - members: a map of members' logins to ids - """ - - _QUERY = """ - organization(login: "{organization}") {{ - team(slug: "{team}") {{ - members(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - login - }} - }} - }} - }} - """ - - members = {} - not_end = True - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["organization"]["team"] - if result is None: - break - result = result["members"] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - members += dict([(node["login"], node["id"]) for node in result["nodes"]]) - - return members - - def get_pull_request(self, number): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequest(number: {number}) {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - number=number, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequest"] - - def find_pull_request(self, base, head): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} baseRefName: "{base}" headRefName: "{head}") {{ - nodes {{ - {pull_request_data} - }} - totalCount - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - base=base, - head=head, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - result = self._run(query)["repository"]["pullRequests"] - if result["totalCount"] > 0: - return result["nodes"][0] - else: - return {} - - def find_pull_requests(self, label_name): - """ - Get all pull-requests filtered by label name - """ - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ - nodes {{ - {pull_request_data} - }} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - label_name=label_name, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequests"]["nodes"] - - def get_pull_requests(self, before_commit): - """ - Get all merged pull-requests from the HEAD of default branch to the last commit (excluding) - """ - 
- _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - target {{ - ... on Commit {{ - history(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - oid - associatedPullRequests(first: {min_page_size}) {{ - totalCount - nodes {{ - ... on PullRequest {{ - {pull_request_data} - - labels(first: {min_page_size}) {{ - totalCount - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - name - color - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - """ - - pull_requests = [] - not_end = True - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["defaultBranchRef"]["target"][ - "history" - ] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - for commit in result["nodes"]: - # FIXME: maybe include `before_commit`? - if str(commit["oid"]) == str(before_commit): - not_end = False - break - - # TODO: fetch all pull-requests that were merged in a single commit. - assert ( - commit["associatedPullRequests"]["totalCount"] - <= self._min_page_size - ) - - for pull_request in commit["associatedPullRequests"]["nodes"]: - if ( - pull_request["baseRepository"]["nameWithOwner"] - == "{}/{}".format(self._owner, self._name) - and pull_request["baseRefName"] == self.default_branch - and pull_request["mergeCommit"]["oid"] == commit["oid"] - ): - pull_requests.append(pull_request) - - return pull_requests - - def create_pull_request( - self, source, target, title, description="", draft=False, can_modify=True - ): - _QUERY = """ - createPullRequest(input: {{ - baseRefName: "{target}", - headRefName: "{source}", - repositoryId: "{id}", - title: "{title}", - body: "{body}", - draft: {draft}, - maintainerCanModify: {modify} - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - target=target, - source=source, - id=self._id, - title=title, - body=description, - draft="true" if draft else "false", - modify="true" if can_modify else "false", - pull_request_data=self._PULL_REQUEST, - ) - return self._run(query, is_mutation=True)["createPullRequest"]["pullRequest"] - - def merge_pull_request(self, id): - _QUERY = """ - mergePullRequest(input: {{ - pullRequestId: "{id}" - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format(id=id, pull_request_data=self._PULL_REQUEST) - return self._run(query, is_mutation=True)["mergePullRequest"]["pullRequest"] - - # FIXME: figure out how to add more assignees at once - def add_assignee(self, pr, assignee): - _QUERY = """ - addAssigneesToAssignable(input: {{ - assignableId: "{id1}", - assigneeIds: "{id2}" - }}) {{ - clientMutationId - }} - """ - - query = _QUERY.format(id1=pr["id"], id2=assignee["id"]) - self._run(query, is_mutation=True) - - def set_label(self, pull_request, label_name): - """ - Set label by name to the pull request - - Args: - pull_request: JSON object returned by `get_pull_requests()` - label_name (string): label name - """ - - _GET_LABEL = """ - repository(owner: "{owner}" name: "{name}") {{ - labels(first: {max_page_size} {next} query: "{label_name}") {{ - pageInfo {{ - hasNextPage - 
endCursor - }} - nodes {{ - id - name - color - }} - }} - }} - """ - - _SET_LABEL = """ - addLabelsToLabelable(input: {{ - labelableId: "{pr_id}", - labelIds: "{label_id}" - }}) {{ - clientMutationId - }} - """ - - labels = [] - not_end = True - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["labels"] - not_end = result["pageInfo"]["hasNextPage"] - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - labels += [label for label in result["nodes"]] - - if not labels: - return - - query = _SET_LABEL.format(pr_id=pull_request["id"], label_id=labels[0]["id"]) - self._run(query, is_mutation=True) - - def _run(self, query, is_mutation=False): - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - - def requests_retry_session( - retries=3, - backoff_factor=0.3, - status_forcelist=(500, 502, 504), - session=None, - ): - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - - headers = {"Authorization": "bearer {}".format(self._token)} - if is_mutation: - query = """ - mutation {{ - {query} - }} - """.format( - query=query - ) - else: - query = """ - query {{ - {query} - rateLimit {{ - cost - remaining - }} - }} - """.format( - query=query - ) - - while True: - request = requests_retry_session().post( - "https://api.github.com/graphql", json={"query": query}, headers=headers - ) - if request.status_code == 200: - result = request.json() - if "errors" in result: - raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) - ) - - if not is_mutation: - import inspect - - caller = inspect.getouterframes(inspect.currentframe(), 2)[1][3] - if caller not in list(self.api_costs.keys()): - self.api_costs[caller] = 0 - self.api_costs[caller] += result["data"]["rateLimit"]["cost"] - - return result["data"] - else: - import json - - raise Exception( - "Query failed with code {code}:\n{json}".format( - code=request.status_code, - json=json.dumps(request.json(), indent=4), - ) - ) From bf591b971a6d9dbd1b63357ac353066d6348fc34 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:53:08 +0200 Subject: [PATCH 26/41] Moar categories --- tests/ci/run_check.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 74e60582e8d..608a708b04b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -89,6 +89,10 @@ MAP_CATEGORY_TO_LABEL = { "Performance Improvement": "pr-performance", "Backward Incompatible Change": "pr-backward-incompatible", "Build/Testing/Packaging Improvement": "pr-build", + "Build Improvement": "pr-build", + "Build/Testing Improvement": "pr-build", + "Build": "pr-build", + "Packaging Improvement": "pr-build", "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", "Not for changelog": "pr-not-for-changelog", "Documentation (changelog entry is not required)": "pr-documentation", From 693b47cdc9c39ee4d51202592cabc915ed9b0962 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 
2022 17:51:56 +0300 Subject: [PATCH 27/41] Revert "Fix enable LLVM for JIT compilation in CMake" --- contrib/llvm-cmake/CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/contrib/llvm-cmake/CMakeLists.txt b/contrib/llvm-cmake/CMakeLists.txt index 87c8a65510f..6ff07f0e016 100644 --- a/contrib/llvm-cmake/CMakeLists.txt +++ b/contrib/llvm-cmake/CMakeLists.txt @@ -1,9 +1,12 @@ -if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined") - set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) +# During cross-compilation in our CI we have to use llvm-tblgen and other building tools +# tools to be build for host architecture and everything else for target architecture (e.g. AArch64) +# Possible workaround is to use llvm-tblgen from some package... +# But lets just enable LLVM for native builds +if (CMAKE_CROSSCOMPILING OR SANITIZE STREQUAL "undefined") + set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) else() - set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) + set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) endif() - option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ${ENABLE_EMBEDDED_COMPILER_DEFAULT}) if (NOT ENABLE_EMBEDDED_COMPILER) From d59941e4f6ccd623b1f938460e6a712dd69a73d0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:15:25 +0200 Subject: [PATCH 28/41] Style --- tests/ci/commit_status_helper.py | 1 + tests/integration/test_allowed_url_from_config/test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 53dfb145b44..4c79d5ace07 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -60,6 +60,7 @@ def post_commit_status_to_file(file_path, description, state, report_url): out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + def post_label(gh, pr_info, label_name): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From 9220bedb7dc57a1f0ee2be1999610fad75ea674a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:18 +0200 Subject: [PATCH 29/41] Add submodule changed --- tests/ci/commit_status_helper.py | 5 +++-- tests/ci/pr_info.py | 9 +++++++++ tests/ci/run_check.py | 12 +++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 4c79d5ace07..b4a3e49372f 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,7 +61,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) -def post_label(gh, pr_info, label_name): +def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) - pull_request.add_to_labels(label_name) + for label in labels_names: + pull_request.add_to_labels(label) diff --git 
a/tests/ci/pr_info.py b/tests/ci/pr_info.py index ee4399792ae..0de4ec89124 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -236,6 +236,15 @@ class PRInfo: return True return False + def has_changes_in_submodules(self): + if self.changed_files is None or not self.changed_files: + return True + + for f in self.changed_files: + if "contrib" in f: + return True + return False + def can_skip_builds_and_use_version_from_master(self): # TODO: See a broken loop if "force tests" in self.labels: diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 608a708b04b..f0100082c71 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_label +from commit_status_helper import get_commit, post_labels NAME = "Run Check (actions)" @@ -22,6 +22,7 @@ OK_SKIP_LABELS = {"release", "pr-backport", "pr-cherrypick"} CAN_BE_TESTED_LABEL = "can be tested" DO_NOT_TEST_LABEL = "do not test" FORCE_TESTS_LABEL = "force tests" +SUBMODULE_CHANGED_LABEL = "submodule changed" # Individual trusted contirbutors who are not in any trusted organization. # Can be changed in runtime: we will append users that we learned to be in @@ -227,15 +228,20 @@ def check_pr_description(pr_info): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(need_orgs=True, pr_event_from_api=True) + pr_info = PRInfo(need_orgs=True, pr_event_from_api=True, need_changed_files=True) can_run, description, labels_state = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) + pr_labels = [] if category in MAP_CATEGORY_TO_LABEL: - post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + if pr_info.has_changes_in_submodules(): + pr_labels.append(SUBMODULE_CHANGED_LABEL) + + post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 999558f00fa01605c42eacd2f85243cf74291e4c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:45 +0200 Subject: [PATCH 30/41] Try fake update submodule --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 1707a7572aa..7c6c6961347 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d +Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 From 1a8b2c9637c53bef291c0ab33866f84ef437f2e4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:06 +0200 Subject: [PATCH 31/41] Remove submodule changed --- tests/ci/commit_status_helper.py | 7 +++++++ tests/ci/run_check.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index b4a3e49372f..a53ce6715d5 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,6 +61,13 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) +def remove_labels(gh, pr_info, labels_names): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + for label in 
labels_names: + pull_request.remove_from_labels(label) + + def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index f0100082c71..0138808c6ae 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_labels +from commit_status_helper import get_commit, post_labels, remove_labels NAME = "Run Check (actions)" @@ -235,13 +235,19 @@ if __name__ == "__main__": description_report, category = check_pr_description(pr_info) pr_labels = [] - if category in MAP_CATEGORY_TO_LABEL: + if ( + category in MAP_CATEGORY_TO_LABEL + and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels + ): pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) if pr_info.has_changes_in_submodules(): pr_labels.append(SUBMODULE_CHANGED_LABEL) + elif SUBMODULE_CHANGED_LABEL in pr_info.labels: + remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) - post_labels(gh, pr_info, pr_labels) + if pr_labels: + post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 2e4ab305dd2a6951ad045b6cbdb920730710ddff Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:13 +0200 Subject: [PATCH 32/41] Revert "Try fake update submodule" This reverts commit 999558f00fa01605c42eacd2f85243cf74291e4c. --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 7c6c6961347..1707a7572aa 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 +Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d From 74cafe154a4e707d7f21ed7c39aa8c0e04e08927 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 19:58:59 +0200 Subject: [PATCH 33/41] fix flaky test 02246_async_insert_quota --- tests/queries/0_stateless/02246_async_insert_quota.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_async_insert_quota.sh b/tests/queries/0_stateless/02246_async_insert_quota.sh index 9fc4df4660c..4da93f94f19 100755 --- a/tests/queries/0_stateless/02246_async_insert_quota.sh +++ b/tests/queries/0_stateless/02246_async_insert_quota.sh @@ -16,7 +16,7 @@ ${CLICKHOUSE_CLIENT} -q "CREATE ROLE r02246" ${CLICKHOUSE_CLIENT} -q "CREATE USER u02246" ${CLICKHOUSE_CLIENT} -q "GRANT INSERT ON async_inserts_02246 TO r02246" ${CLICKHOUSE_CLIENT} -q "GRANT r02246 to u02246" -${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 1 HOUR MAX QUERY INSERTS = 2 TO r02246" +${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 100 YEAR MAX QUERY INSERTS = 2 TO r02246" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (1, 'a')" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (2, 'b')" From 476c7f9d7e10e882a085a2b2a08cec43ed951718 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:32:28 +0000 Subject: [PATCH 34/41] Update ci checks server. 
--- tests/ci/clickhouse_helper.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 7ccbcb4a47e..3731037bf4c 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -10,13 +10,13 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url2") - self.auth = { - "X-ClickHouse-User": get_parameter_from_ssm( - "clickhouse-test-stat-login2" - ), - "X-ClickHouse-Key": "", - } + self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + + self.url = url + self.auth = { + 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + } @staticmethod def _insert_json_str_info_impl(url, auth, db, table, json_str): From 3995a52da61cb727f9604b5d0114adc500d4bced Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:36:36 +0000 Subject: [PATCH 35/41] Fix check. --- tests/ci/clickhouse_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 3731037bf4c..ea99b97c7d8 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,7 +8,7 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None): + def __init__(self, url=None, user=None, password=None): if url is None: self.url = get_parameter_from_ssm("clickhouse-test-stat-url") From bb82f77477038fa9623f40c21d81ed9899eff429 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:46:48 +0000 Subject: [PATCH 36/41] Fix check. 
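
Patches 34–36 rework `ClickHouseHelper.__init__` in three small steps: the first hunk references `user` and `password` before they exist as parameters, and the follow-up commits first add and then drop them again while fixing the URL fallback. A sketch of the constructor these commits converge on, using the SSM parameter names that appear in the diffs (a reading aid, not a verbatim copy of the final file):

```python
from get_robot_token import get_parameter_from_ssm


class ClickHouseHelper:
    def __init__(self, url=None):
        # Fall back to the endpoint stored in SSM when no explicit URL is given.
        if url is None:
            url = get_parameter_from_ssm("clickhouse-test-stat-url")

        self.url = url
        # Credentials always come from SSM; callers only override the URL.
        self.auth = {
            "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"),
            "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"),
        }
```
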
--- tests/ci/clickhouse_helper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index ea99b97c7d8..093d5541362 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,14 +8,14 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None, user=None, password=None): + def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + url = get_parameter_from_ssm("clickhouse-test-stat-url") self.url = url self.auth = { - 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 081073763748106499d3279894f99705ab332700 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:58:16 +0000 Subject: [PATCH 37/41] Fix style --- tests/ci/clickhouse_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 093d5541362..6627a3974d1 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -14,8 +14,8 @@ class ClickHouseHelper: self.url = url self.auth = { - 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 3849e63ab186d86a29ed0c77e1524800ff87bafd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 19:06:50 +0000 Subject: [PATCH 38/41] Change database --- docker/test/performance-comparison/compare.sh | 2 +- tests/ci/clickhouse_helper.py | 2 +- tests/ci/compatibility_check.py | 2 +- tests/ci/docker_images_check.py | 2 +- tests/ci/docker_manifests_merge.py | 2 +- tests/ci/docs_check.py | 2 +- tests/ci/fast_test_check.py | 2 +- tests/ci/functional_test_check.py | 2 +- tests/ci/integration_test_check.py | 2 +- tests/ci/keeper_jepsen_check.py | 2 +- tests/ci/split_build_smoke_check.py | 2 +- tests/ci/stress_check.py | 2 +- tests/ci/style_check.py | 2 +- tests/ci/unit_tests_check.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 54f71ce05bb..cdfa080a475 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -1378,7 +1378,7 @@ $REF_SHA $SHA_TO_TEST $(numactl --hardware | sed -n 's/^available:[[:space:]]\+/ EOF # Also insert some data about the check into the CI checks table. 
- "${client[@]}" --query "INSERT INTO "'"'"gh-data"'"'".checks FORMAT TSVWithNamesAndTypes" \ + "${client[@]}" --query "INSERT INTO "'"'"default"'"'".checks FORMAT TSVWithNamesAndTypes" \ < ci-checks.tsv set -x diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 6627a3974d1..218aaca8b91 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -179,7 +179,7 @@ def mark_flaky_tests(clickhouse_helper, check_name, test_results): check_name=check_name ) - tests_data = clickhouse_helper.select_json_each_row("gh-data", query) + tests_data = clickhouse_helper.select_json_each_row("default", query) master_failed_tests = {row["test_name"] for row in tests_data} logging.info("Found flaky tests: %s", ", ".join(master_failed_tests)) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index d546fabf231..a6846257cce 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -197,4 +197,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 3d0cc468aec..44c4623b469 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -459,7 +459,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docker_manifests_merge.py b/tests/ci/docker_manifests_merge.py index 8bd50819877..9371440346e 100644 --- a/tests/ci/docker_manifests_merge.py +++ b/tests/ci/docker_manifests_merge.py @@ -234,7 +234,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 58678b160a4..a7de66859e2 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -114,4 +114,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 64e04594786..8c17c15b463 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -204,7 +204,7 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) # Refuse other checks to run if fast test failed if state != "success": diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 52ec5a0f8e9..c5a44ba66d2 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -356,7 +356,7 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if state != "success": if "force-tests" in pr_info.labels: diff 
--git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 30009414d6e..636fbe664ce 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -279,4 +279,4 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 24d720e67ab..14c31927b75 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -271,5 +271,5 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) clear_autoscaling_group() diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 41ba6c2fedb..f281bdc55a8 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -147,4 +147,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 32c181140e2..4ecdd249351 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -176,4 +176,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 1b3037217c8..4c210f9beab 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -117,4 +117,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 84c4faa822d..6db160d9a54 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -173,4 +173,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) From 68ec0d92c0c5446d7a32156ae2940e78bedb582f Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 10:58:34 +0200 Subject: [PATCH 39/41] Remove if category changed --- tests/ci/run_check.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 0138808c6ae..5f3792f3432 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -234,20 +234,29 @@ if __name__ == "__main__": commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) - pr_labels = [] + pr_labels_to_add = [] + pr_labels_to_remove = [] if ( category in MAP_CATEGORY_TO_LABEL and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels ): - pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) + + for label in pr_info.labels: + if label in MAP_CATEGORY_TO_LABEL and label not in 
pr_labels_to_add: + pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): - pr_labels.append(SUBMODULE_CHANGED_LABEL) + pr_labels_to_add.append(SUBMODULE_CHANGED_LABEL) elif SUBMODULE_CHANGED_LABEL in pr_info.labels: - remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) + pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL) + + if pr_labels_to_add: + post_labels(gh, pr_info, pr_labels_to_add) + + if pr_labels_to_remove: + remove_labels(gh, pr_info, pr_labels_to_remove) - if pr_labels: - post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From bcf64a73d1c272dca22a1ca910df3153a6a66784 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:00:45 +0200 Subject: [PATCH 40/41] Followup --- tests/ci/run_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 5f3792f3432..8f28287e88b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,7 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL and label not in pr_labels_to_add: + if label in MAP_CATEGORY_TO_LABEL.values() and label not in pr_labels_to_add: pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): From c510ede8dd5b9e739be4de6b6c57aa8bbddfa22b Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:19:11 +0200 Subject: [PATCH 41/41] Fixup --- tests/ci/run_check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 8f28287e88b..93dc77124c2 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,11 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL.values() and label not in pr_labels_to_add: + if ( + label in MAP_CATEGORY_TO_LABEL.values() + and category in MAP_CATEGORY_TO_LABEL + and label != MAP_CATEGORY_TO_LABEL[category] + ): pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules():
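
Taken together, the last three commits (39–41) leave the labelling step of `run_check.py` computing an add list and a remove list and syncing both against the PR. A condensed sketch of that final logic, reusing only names defined earlier in this series (`MAP_CATEGORY_TO_LABEL`, `SUBMODULE_CHANGED_LABEL`, `post_labels`, `remove_labels`, `pr_info`); it summarizes the diffs above rather than reproducing the file line by line:

```python
# Condensed reading of the label sync that run_check.py ends up with after PATCH 41.
pr_labels_to_add = []
pr_labels_to_remove = []

category_label = MAP_CATEGORY_TO_LABEL.get(category)

# Add the label for the detected changelog category if the PR is missing it.
if category_label and category_label not in pr_info.labels:
    pr_labels_to_add.append(category_label)

# Drop stale category labels when the description now maps to a different label.
if category_label:
    for label in pr_info.labels:
        if label in MAP_CATEGORY_TO_LABEL.values() and label != category_label:
            pr_labels_to_remove.append(label)

# Keep the "submodule changed" label in sync with the changed files.
if pr_info.has_changes_in_submodules():
    pr_labels_to_add.append(SUBMODULE_CHANGED_LABEL)
elif SUBMODULE_CHANGED_LABEL in pr_info.labels:
    pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL)

if pr_labels_to_add:
    post_labels(gh, pr_info, pr_labels_to_add)
if pr_labels_to_remove:
    remove_labels(gh, pr_info, pr_labels_to_remove)
```
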