From be9082f1c4a52a28e958e10f95ea7d45911bdc2b Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 21 Feb 2022 15:45:17 +0000 Subject: [PATCH 001/111] Update comments for OvercommitTracker --- src/Common/OvercommitTracker.cpp | 16 +++++++++++----- src/Common/OvercommitTracker.h | 11 +++++++++-- src/Interpreters/ProcessList.h | 9 --------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 0e70619f628..b0d60b1c25c 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -23,6 +23,12 @@ void OvercommitTracker::setMaxWaitTime(UInt64 wait_time) bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker) { + // NOTE: DO NOT CHANGE THE ORDER OF LOCKS + // + // global_mutex must be acquired before overcommit_m, because + // method OvercommitTracker::unsubscribe(MemoryTracker *) is + // always called with already acquired global_mutex in + // ProcessListEntry::~ProcessListEntry(). std::unique_lock global_lock(global_mutex); std::unique_lock lk(overcommit_m); @@ -76,7 +82,7 @@ void UserOvercommitTracker::pickQueryToExcludeImpl() MemoryTracker * query_tracker = nullptr; OvercommitRatio current_ratio{0, 0}; // At this moment query list must be read only. - // BlockQueryIfMemoryLimit is used in ProcessList to guarantee this. + // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. auto & queries = user_process_list->queries; LOG_DEBUG(logger, "Trying to choose query to stop from {} queries", queries.size()); for (auto const & query : queries) @@ -111,9 +117,9 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() MemoryTracker * query_tracker = nullptr; OvercommitRatio current_ratio{0, 0}; // At this moment query list must be read only. - // BlockQueryIfMemoryLimit is used in ProcessList to guarantee this. - LOG_DEBUG(logger, "Trying to choose query to stop"); - process_list->processEachQueryStatus([&](DB::QueryStatus const & query) + // This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery. + LOG_DEBUG(logger, "Trying to choose query to stop from {} queries", process_list->size()); + for (auto const & query : process_list->processes) { if (query.isKilled()) return; @@ -134,7 +140,7 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl() query_tracker = memory_tracker; current_ratio = ratio; } - }); + } LOG_DEBUG(logger, "Selected to stop query with overcommit ratio {}/{}", current_ratio.committed, current_ratio.soft_limit); picked_tracker = query_tracker; diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 7c7974f0a24..f59390a8ace 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -43,8 +43,6 @@ class MemoryTracker; // is killed to free memory. struct OvercommitTracker : boost::noncopyable { - explicit OvercommitTracker(std::mutex & global_mutex_); - void setMaxWaitTime(UInt64 wait_time); bool needToStopQuery(MemoryTracker * tracker); @@ -54,8 +52,12 @@ struct OvercommitTracker : boost::noncopyable virtual ~OvercommitTracker() = default; protected: + explicit OvercommitTracker(std::mutex & global_mutex_); + virtual void pickQueryToExcludeImpl() = 0; + // This mutex is used to disallow concurrent access + // to picked_tracker and cancelation_state variables. 
mutable std::mutex overcommit_m; mutable std::condition_variable cv; @@ -87,6 +89,11 @@ private: } } + // Global mutex which is used in ProcessList to synchronize + // insertion and deletion of queries. + // OvercommitTracker::pickQueryToExcludeImpl() implementations + // require this mutex to be locked, because they read the list (or a sublist) + // of queries. std::mutex & global_mutex; }; diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 493b2ba81a9..5b68018f0ec 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -351,15 +351,6 @@ public: max_size = max_size_; } - // Before calling this method you should be sure - // that lock is acquired. - template - void processEachQueryStatus(F && func) const - { - for (auto && query : processes) - func(query); - } - void setMaxInsertQueriesAmount(size_t max_insert_queries_amount_) { std::lock_guard lock(mutex); From a5927efb64c41c7790fbbd3fbbaf1abbb953d9cf Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 22 Feb 2022 00:36:00 +0000 Subject: [PATCH 002/111] Add docs --- .../settings.md | 11 +++++++ .../operations/settings/memory-overcommit.md | 31 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 docs/en/operations/settings/memory-overcommit.md diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 78f6c71c65f..52f0ae6e804 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1616,3 +1616,14 @@ Possible values: Default value: `10000`.

+## global_memory_usage_overcommit_max_wait_microseconds {#global_memory_usage_overcommit_max_wait_microseconds}
+
+Sets the maximum waiting time for the global overcommit tracker.
+
+Possible values:
+
+- Positive integer.
+
+Default value: `0`.
+
+
diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md new file mode 100644 index 00000000000..e36678de3df --- /dev/null +++ b/docs/en/operations/settings/memory-overcommit.md @@ -0,0 +1,31 @@
+# Memory overcommit
+
+Memory overcommit is an experimental technique intended to allow setting more flexible memory limits for queries.
+
+The idea of this technique is to introduce settings which represent the guaranteed amount of memory a query can use.
+When memory overcommit is enabled and the memory limit is reached, ClickHouse will select the most overcommitted query and try to free memory by killing it.
+
+When the memory limit is reached, any query will wait for some time while attempting to allocate new memory.
+If the selected query is killed and memory is freed within the waiting timeout, the waiting query continues execution; otherwise it is killed as well.
+
+The query to stop is selected by either the global or the user overcommit tracker, depending on which memory limit is reached.
+
+## User overcommit tracker
+
+The user overcommit tracker finds the query with the biggest overcommit ratio in the user's query list.
+The overcommit ratio is computed as the number of allocated bytes divided by the value of the `max_guaranteed_memory_usage` setting.
+
+The waiting timeout is set by the `memory_usage_overcommit_max_wait_microseconds` setting.
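+
+For example, if `max_guaranteed_memory_usage` is set to 4000 bytes and a query has already allocated 20000 bytes, its overcommit ratio is 20000 / 4000 = 5, so it will be stopped before a query whose ratio is, say, 2 (the numbers here are purely illustrative).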
+
+**Example**
+
+```sql
+SELECT number FROM numbers(1000) GROUP BY number SETTINGS max_guaranteed_memory_usage=4000, memory_usage_overcommit_max_wait_microseconds=500
+```
+
+## Global overcommit tracker
+
+The global overcommit tracker finds the query with the biggest overcommit ratio in the list of all queries.
+In this case the overcommit ratio is computed as the number of allocated bytes divided by the value of the `max_guaranteed_memory_usage_for_user` setting.
+
+The waiting timeout is set by the `global_memory_usage_overcommit_max_wait_microseconds` parameter in the configuration file.

From 607f785e48d6a08ca975ebbd7070c8366903fbe9 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 17 Mar 2022 12:31:43 +0000 Subject: [PATCH 003/111] Revert "Merge pull request #35145 from bigo-sg/lower-column-name" This reverts commit ebf72bf61d867e7540b5c98078f5f9ae3a612fba, reversing changes made to f1b812bdc13766ffd79ab3e3a8e090a0844c83ac. --- src/Core/Settings.h | 1 - src/Formats/FormatFactory.cpp | 1 - src/Formats/FormatSettings.h | 1 - .../Formats/Impl/ArrowColumnToCHColumn.cpp | 6 ++--- .../Formats/Impl/ORCBlockInputFormat.cpp | 23 +--------------- .../Formats/Impl/ORCBlockInputFormat.h | 3 ++- .../Formats/Impl/ParquetBlockInputFormat.cpp | 26 +++---------------- .../Formats/Impl/ParquetBlockInputFormat.h | 1 - ...format_use_lowercase_column_name.reference | 6 ----- 9 files changed, 8 insertions(+), 60 deletions(-) delete mode 100644 tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 8d28696094b..05946a1b385 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -614,7 +614,6 @@ class IColumn; M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \ M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ - M(Bool, input_format_use_lowercase_column_name, false, "Use lowercase column name while reading input formats", 0) \ M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \ M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \ M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 08554cf7e07..3fea8d3eb7b 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -89,7 +89,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; format_settings.null_as_default = settings.input_format_null_as_default; - format_settings.use_lowercase_column_name = settings.input_format_use_lowercase_column_name; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; format_settings.parquet.import_nested = settings.input_format_parquet_import_nested; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4881c1a43c8..751b3c51fa8
100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -32,7 +32,6 @@ struct FormatSettings bool null_as_default = true; bool decimal_trailing_zeros = false; bool defaults_for_omitted_fields = true; - bool use_lowercase_column_name = false; bool seekable_read = true; UInt64 max_rows_to_read_for_schema_inference = 100; diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 5c367bb69f0..ecaa485c3d6 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -32,6 +32,7 @@ #include #include + /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. #define FOR_ARROW_NUMERIC_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ @@ -65,9 +66,9 @@ namespace ErrorCodes extern const int DUPLICATE_COLUMN; extern const int THERE_IS_NO_COLUMN; extern const int UNKNOWN_EXCEPTION; - extern const int INCORRECT_NUMBER_OF_COLUMNS; } + /// Inserts numeric data right into internal column data to reduce an overhead template > static ColumnWithTypeAndName readColumnWithNumericData(std::shared_ptr & arrow_column, const String & column_name) @@ -531,9 +532,6 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptrsecond->length(); columns_list.reserve(header.rows()); diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index aa9f7874ae8..4950e1fb952 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -1,5 +1,4 @@ #include "ORCBlockInputFormat.h" -#include #if USE_ORC #include @@ -53,9 +52,6 @@ Chunk ORCBlockInputFormat::generate() if (!table || !table->num_rows()) return res; - if (format_settings.use_lowercase_column_name) - table = *table->RenameColumns(include_column_names); - arrow_column_to_ch_column->arrowTableToCHChunk(res, table); /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. 
@@ -73,7 +69,6 @@ void ORCBlockInputFormat::resetParser() file_reader.reset(); include_indices.clear(); - include_column_names.clear(); block_missing_values.clear(); } @@ -125,20 +120,6 @@ static void getFileReaderAndSchema( if (!read_schema_result.ok()) throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); schema = std::move(read_schema_result).ValueOrDie(); - - if (format_settings.use_lowercase_column_name) - { - std::vector> fields; - fields.reserve(schema->num_fields()); - for (int i = 0; i < schema->num_fields(); ++i) - { - const auto& field = schema->field(i); - auto name = field->name(); - boost::to_lower(name); - fields.push_back(field->WithName(name)); - } - schema = arrow::schema(fields, schema->metadata()); - } } void ORCBlockInputFormat::prepareReader() @@ -167,11 +148,9 @@ void ORCBlockInputFormat::prepareReader() const auto & name = schema->field(i)->name(); if (getPort().getHeader().has(name) || nested_table_names.contains(name)) { + column_names.push_back(name); for (int j = 0; j != indexes_count; ++j) - { include_indices.push_back(index + j); - include_column_names.push_back(name); - } } index += indexes_count; } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index bd2151d78ff..bb136d02d6e 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -45,9 +45,10 @@ private: std::unique_ptr arrow_column_to_ch_column; + std::vector column_names; + // indices of columns to read from ORC file std::vector include_indices; - std::vector include_column_names; std::vector missing_columns; BlockMissingValues block_missing_values; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 548bf0138f5..3f0d9980573 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -1,6 +1,4 @@ #include "ParquetBlockInputFormat.h" -#include - #if USE_PARQUET #include @@ -15,6 +13,9 @@ #include "ArrowColumnToCHColumn.h" #include +#include + + namespace DB { @@ -56,9 +57,6 @@ Chunk ParquetBlockInputFormat::generate() throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA}; - if (format_settings.use_lowercase_column_name) - table = *table->RenameColumns(column_names); - ++row_group_current; arrow_column_to_ch_column->arrowTableToCHChunk(res, table); @@ -78,7 +76,6 @@ void ParquetBlockInputFormat::resetParser() file_reader.reset(); column_indices.clear(); - column_names.clear(); row_group_current = 0; block_missing_values.clear(); } @@ -123,20 +120,6 @@ static void getFileReaderAndSchema( return; THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - - if (format_settings.use_lowercase_column_name) - { - std::vector> fields; - fields.reserve(schema->num_fields()); - for (int i = 0; i < schema->num_fields(); ++i) - { - const auto& field = schema->field(i); - auto name = field->name(); - boost::to_lower(name); - fields.push_back(field->WithName(name)); - } - schema = arrow::schema(fields, schema->metadata()); - } } void ParquetBlockInputFormat::prepareReader() @@ -167,10 +150,7 @@ void ParquetBlockInputFormat::prepareReader() if (getPort().getHeader().has(name) || nested_table_names.contains(name)) { for (int j = 0; j != 
indexes_count; ++j) - { column_indices.push_back(index + j); - column_names.push_back(name); - } } index += indexes_count; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index eba9aac29f2..1faadaa3d21 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -40,7 +40,6 @@ private: int row_group_total = 0; // indices of columns to read from Parquet file std::vector column_indices; - std::vector column_names; std::unique_ptr arrow_column_to_ch_column; int row_group_current = 0; std::vector missing_columns; diff --git a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference b/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference deleted file mode 100644 index 5c383cb3035..00000000000 --- a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.reference +++ /dev/null @@ -1,6 +0,0 @@ -Parquet -123 1 -456 2 -ORC -123 1 -456 2 From f75b0542554af61faad7d8fee972b8aae0bd410a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 07:47:37 +0000 Subject: [PATCH 004/111] Allow case insensitive column matching --- src/Common/StringUtils/StringUtils.h | 9 +++ src/Core/Block.cpp | 30 +++++--- src/Core/Block.h | 14 ++-- src/Core/Settings.h | 2 + src/Formats/FormatFactory.cpp | 2 + src/Formats/FormatSettings.h | 2 + .../Formats/Impl/ArrowColumnToCHColumn.cpp | 72 ++++++++++++++----- .../Formats/Impl/ArrowColumnToCHColumn.h | 6 +- .../Formats/Impl/ORCBlockInputFormat.cpp | 27 ++++++- .../Formats/Impl/ParquetBlockInputFormat.cpp | 28 +++++++- 10 files changed, 152 insertions(+), 40 deletions(-) diff --git a/src/Common/StringUtils/StringUtils.h b/src/Common/StringUtils/StringUtils.h index 21df0f5ae8b..e1a753e816d 100644 --- a/src/Common/StringUtils/StringUtils.h +++ b/src/Common/StringUtils/StringUtils.h @@ -240,6 +240,15 @@ inline bool equalsCaseInsensitive(char a, char b) return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b); } +inline bool equalsCaseInsensitive(const std::string_view a, const std::string_view b) +{ + if (a.length() != b.length()) + return false; + + return std::equal( + a.begin(), a.end(), b.begin(), [](const auto first, const auto second) { return equalsCaseInsensitive(first, second); }); +} + template std::string trim(const std::string & str, F && predicate) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 5c93d6719fa..306f99d7c24 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -269,8 +269,18 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const } -const ColumnWithTypeAndName * Block::findByName(const std::string & name) const +const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool case_insensitive) const { + if (case_insensitive) + { + auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return equalsCaseInsensitive(column.name, name); }); + if (found == data.end()) + { + return nullptr; + } + return &*found; + } + auto it = index_by_name.find(name); if (index_by_name.end() == it) { @@ -280,19 +290,23 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name) const } -const ColumnWithTypeAndName & Block::getByName(const std::string & name) const +const ColumnWithTypeAndName & Block::getByName(const std::string & name, bool case_insensitive) const { - const auto * result = findByName(name); + const 
auto * result = findByName(name, case_insensitive); if (!result) - throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() - , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + throw Exception( + "Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); return *result; } -bool Block::has(const std::string & name) const +bool Block::has(const std::string & name, bool case_insensitive) const { + if (case_insensitive) + return std::find_if(data.begin(), data.end(), [&](const auto & column) { return equalsCaseInsensitive(column.name, name); }) + != data.end(); + return index_by_name.end() != index_by_name.find(name); } @@ -301,8 +315,8 @@ size_t Block::getPositionByName(const std::string & name) const { auto it = index_by_name.find(name); if (index_by_name.end() == it) - throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() - , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + throw Exception( + "Not found column " + name + " in block. There are only columns: " + dumpNames(), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); return it->second; } diff --git a/src/Core/Block.h b/src/Core/Block.h index 66e16b70f47..c5d3e1ae35a 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -60,21 +60,21 @@ public: ColumnWithTypeAndName & safeGetByPosition(size_t position); const ColumnWithTypeAndName & safeGetByPosition(size_t position) const; - ColumnWithTypeAndName* findByName(const std::string & name) + ColumnWithTypeAndName* findByName(const std::string & name, bool case_insensitive = false) { return const_cast( - const_cast(this)->findByName(name)); + const_cast(this)->findByName(name, case_insensitive)); } - const ColumnWithTypeAndName * findByName(const std::string & name) const; + const ColumnWithTypeAndName * findByName(const std::string & name, bool case_insensitive = false) const; - ColumnWithTypeAndName & getByName(const std::string & name) + ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) { return const_cast( - const_cast(this)->getByName(name)); + const_cast(this)->getByName(name, case_insensitive)); } - const ColumnWithTypeAndName & getByName(const std::string & name) const; + const ColumnWithTypeAndName & getByName(const std::string & name, bool case_insensitive = false) const; Container::iterator begin() { return data.begin(); } Container::iterator end() { return data.end(); } @@ -83,7 +83,7 @@ public: Container::const_iterator cbegin() const { return data.cbegin(); } Container::const_iterator cend() const { return data.cend(); } - bool has(const std::string & name) const; + bool has(const std::string & name, bool case_insensitive = false) const; size_t getPositionByName(const std::string & name) const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 05946a1b385..6d78b5d71ac 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -617,7 +617,9 @@ class IColumn; M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \ M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \ M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ + M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert 
array of structs into Nested table in Parquet input format.", 0) \ + M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \ M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 3fea8d3eb7b..8c25fef53cb 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -92,6 +92,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; format_settings.parquet.import_nested = settings.input_format_parquet_import_nested; + format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; @@ -125,6 +126,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 751b3c51fa8..de05dda9138 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -136,6 +136,7 @@ struct FormatSettings UInt64 row_group_size = 1000000; bool import_nested = false; bool allow_missing_columns = false; + bool case_insensitive_column_matching = false; } parquet; struct Pretty @@ -216,6 +217,7 @@ struct FormatSettings bool import_nested = false; bool allow_missing_columns = false; int64_t row_batch_size = 100'000; + bool case_insensitive_column_matching = false; } orc; /// For capnProto format we should determine how to diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index ecaa485c3d6..eaf4e6be6cc 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1,5 +1,8 @@ #include "ArrowColumnToCHColumn.h" +#include +#include + #if USE_ARROW || USE_ORC || USE_PARQUET #include @@ -66,6 +69,7 @@ namespace ErrorCodes extern const int DUPLICATE_COLUMN; extern const int THERE_IS_NO_COLUMN; extern const int UNKNOWN_EXCEPTION; + extern const int INCORRECT_NUMBER_OF_COLUMNS; } @@ 
-485,7 +489,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, bool lowercase_names) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -505,26 +509,36 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, std::unordered_map> dict_values; ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + if (lowercase_names) + { + boost::to_lower(sample_column.name); + } + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_) - : header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_) + const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_, bool case_insensitive_matching_) + : header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_), case_insensitive_matching(case_insensitive_matching_) { } void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr & table) { NameToColumnPtr name_to_column_ptr; - for (const auto & column_name : table->ColumnNames()) + for (auto column_name : table->ColumnNames()) { std::shared_ptr arrow_column = table->GetColumnByName(column_name); if (!arrow_column) throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Column '{}' is duplicated", column_name); - name_to_column_ptr[column_name] = arrow_column; + + if (case_insensitive_matching) + { + boost::to_lower(column_name); + } + name_to_column_ptr[std::move(column_name)] = arrow_column; } arrowColumnsToCHChunk(res, name_to_column_ptr); @@ -532,6 +546,8 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptrsecond->length(); columns_list.reserve(header.rows()); @@ -540,22 +556,33 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & { const ColumnWithTypeAndName & header_column = header.getByPosition(column_i); + auto search_column_name = header_column.name; + if (case_insensitive_matching) + { + boost::to_lower(search_column_name); + } + bool read_from_nested = false; String nested_table_name = Nested::extractTableName(header_column.name); - if (!name_to_column_ptr.contains(header_column.name)) + String search_nested_table_name = nested_table_name; + if (case_insensitive_matching) + { + boost::to_lower(search_nested_table_name); + } + if (!name_to_column_ptr.contains(search_column_name)) { /// Check if it's a column from nested table. 
- if (import_nested && name_to_column_ptr.contains(nested_table_name)) + if (import_nested && name_to_column_ptr.contains(search_nested_table_name)) { - if (!nested_tables.contains(nested_table_name)) + if (!nested_tables.contains(search_nested_table_name)) { - std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; + std::shared_ptr arrow_column = name_to_column_ptr[search_nested_table_name]; ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); - nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); + nested_tables[search_nested_table_name] = std::make_shared(Nested::flatten(block)); } - read_from_nested = nested_tables[nested_table_name]->has(header_column.name); + read_from_nested = nested_tables[search_nested_table_name]->has(header_column.name, case_insensitive_matching); } if (!read_from_nested) @@ -572,11 +599,17 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & } } - std::shared_ptr arrow_column = name_to_column_ptr[header_column.name]; + std::shared_ptr arrow_column = name_to_column_ptr[search_column_name]; ColumnWithTypeAndName column; if (read_from_nested) - column = nested_tables[nested_table_name]->getByName(header_column.name); + { + column = nested_tables[search_nested_table_name]->getByName(header_column.name, case_insensitive_matching); + if (case_insensitive_matching) + { + column.name = header_column.name; + } + } else column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); @@ -605,18 +638,19 @@ std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema auto flatten_block_from_arrow = Nested::flatten(block_from_arrow); for (size_t i = 0, columns = header.columns(); i < columns; ++i) { - const auto & column = header.getByPosition(i); + const auto & header_column = header.getByPosition(i); + auto column_name = header_column.name; bool read_from_nested = false; - String nested_table_name = Nested::extractTableName(column.name); - if (!block_from_arrow.has(column.name)) + String nested_table_name = Nested::extractTableName(column_name); + if (!block_from_arrow.has(column_name, case_insensitive_matching)) { - if (import_nested && block_from_arrow.has(nested_table_name)) - read_from_nested = flatten_block_from_arrow.has(column.name); + if (import_nested && block_from_arrow.has(nested_table_name, case_insensitive_matching)) + read_from_nested = flatten_block_from_arrow.has(column_name, case_insensitive_matching); if (!read_from_nested) { if (!allow_missing_columns) - throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", column.name}; + throw Exception{ErrorCodes::THERE_IS_NO_COLUMN, "Column '{}' is not presented in input data.", header_column.name}; missing_columns.push_back(i); } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 07e7fb36404..64918d03904 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -25,7 +25,8 @@ public: const Block & header_, const std::string & format_name_, bool import_nested_, - bool allow_missing_columns_); + bool allow_missing_columns_, + bool case_insensitive_matching_ = false); void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); @@ -34,7 +35,7 @@ public: /// Get missing columns that exists in header but 
not in arrow::Schema std::vector getMissingColumns(const arrow::Schema & schema) const; - static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, bool lowercase_names = false); private: const Block & header; @@ -42,6 +43,7 @@ private: bool import_nested; /// If false, throw exception if some columns in header not exists in arrow table. bool allow_missing_columns; + bool case_insensitive_matching; /// Map {column name : dictionary column}. /// To avoid converting dictionary from Arrow Dictionary diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 4950e1fb952..cb6d4a19d20 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -1,4 +1,5 @@ #include "ORCBlockInputFormat.h" +#include "Common/StringUtils/StringUtils.h" #if USE_ORC #include @@ -130,7 +131,7 @@ void ORCBlockInputFormat::prepareReader() return; arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns); + getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns, format_settings.orc.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); std::unordered_set nested_table_names; @@ -146,12 +147,34 @@ void ORCBlockInputFormat::prepareReader() /// so we should recursively count the number of indices we need for this type. int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - if (getPort().getHeader().has(name) || nested_table_names.contains(name)) + const bool contains_column = std::invoke([&] + { + if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) + { + return true; + } + + if (!format_settings.parquet.case_insensitive_column_matching) + { + return nested_table_names.contains(name); + } + + return std::find_if( + nested_table_names.begin(), + nested_table_names.end(), + [&](const auto & nested_table_name) + { + return equalsCaseInsensitive(nested_table_name, name); + }) != nested_table_names.end(); + }); + + if (contains_column) { column_names.push_back(name); for (int j = 0; j != indexes_count; ++j) include_indices.push_back(index + j); } + index += indexes_count; } } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 3f0d9980573..0c39ca0498b 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -1,4 +1,5 @@ #include "ParquetBlockInputFormat.h" +#include "Common/StringUtils/StringUtils.h" #if USE_PARQUET #include @@ -132,7 +133,7 @@ void ParquetBlockInputFormat::prepareReader() row_group_total = file_reader->num_row_groups(); row_group_current = 0; - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns); + arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns, format_settings.parquet.case_insensitive_column_matching); missing_columns = 
arrow_column_to_ch_column->getMissingColumns(*schema); std::unordered_set nested_table_names; @@ -147,11 +148,34 @@ void ParquetBlockInputFormat::prepareReader() /// count the number of indices we need for this type. int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - if (getPort().getHeader().has(name) || nested_table_names.contains(name)) + + const bool contains_column = std::invoke([&] + { + if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) + { + return true; + } + + if (!format_settings.parquet.case_insensitive_column_matching) + { + return nested_table_names.contains(name); + } + + return std::find_if( + nested_table_names.begin(), + nested_table_names.end(), + [&](const auto & nested_table_name) + { + return equalsCaseInsensitive(nested_table_name, name); + }) != nested_table_names.end(); + }); + + if (contains_column) { for (int j = 0; j != indexes_count; ++j) column_indices.push_back(index + j); } + index += indexes_count; } } From d73c906e68efe1214e7e84c9cbc10f346cc96b74 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 07:50:17 +0000 Subject: [PATCH 005/111] Format code --- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 157 ++++++++++-------- .../Formats/Impl/ArrowColumnToCHColumn.h | 8 +- .../Formats/Impl/ORCBlockInputFormat.cpp | 89 +++++----- .../Formats/Impl/ParquetBlockInputFormat.cpp | 105 ++++++------ 4 files changed, 186 insertions(+), 173 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index eaf4e6be6cc..02eaa3ce952 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1,43 +1,43 @@ #include "ArrowColumnToCHColumn.h" -#include #include +#include #if USE_ARROW || USE_ORC || USE_PARQUET -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. 
-#define FOR_ARROW_NUMERIC_TYPES(M) \ +# define FOR_ARROW_NUMERIC_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::INT8, DB::Int8) \ M(arrow::Type::INT16, DB::Int16) \ @@ -48,7 +48,7 @@ M(arrow::Type::FLOAT, DB::Float32) \ M(arrow::Type::DOUBLE, DB::Float64) -#define FOR_ARROW_INDEXES_TYPES(M) \ +# define FOR_ARROW_INDEXES_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::INT8, DB::UInt8) \ M(arrow::Type::UINT16, DB::UInt16) \ @@ -180,8 +180,12 @@ static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr(chunk.Value(value_i)); if (days_num > DATE_LUT_MAX_EXTEND_DAY_NUM) - throw Exception{ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, - "Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", days_num, column_name, DATE_LUT_MAX_DAY_NUM}; + throw Exception{ + ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, + "Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", + days_num, + column_name, + DATE_LUT_MAX_DAY_NUM}; column_data.emplace_back(days_num); } @@ -230,7 +234,8 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) +static ColumnWithTypeAndName +readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); @@ -242,7 +247,8 @@ static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr(*(arrow_column->chunk(chunk_i))); for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) { - column_data.emplace_back(chunk.IsNull(value_i) ? DecimalType(0) : *reinterpret_cast(chunk.Value(value_i))); // TODO: copy column + column_data.emplace_back( + chunk.IsNull(value_i) ? DecimalType(0) : *reinterpret_cast(chunk.Value(value_i))); // TODO: copy column } } return {std::move(internal_column), internal_type, column_name}; @@ -303,10 +309,9 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr switch (arrow_column->type()->id()) { # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ - case ARROW_NUMERIC_TYPE: \ - { \ - return readColumnWithNumericData(arrow_column, "").column; \ - } + case ARROW_NUMERIC_TYPE: { \ + return readColumnWithNumericData(arrow_column, "").column; \ + } FOR_ARROW_INDEXES_TYPES(DISPATCH) # undef DISPATCH default: @@ -360,15 +365,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( // ClickHouse writes Date as arrow UINT16 and DateTime as arrow UINT32, // so, read UINT16 as Date and UINT32 as DateTime to perform correct conversion // between Date and DateTime further. 
- case arrow::Type::UINT16: - { + case arrow::Type::UINT16: { auto column = readColumnWithNumericData(arrow_column, column_name); if (read_ints_as_dates) column.type = std::make_shared(); return column; } - case arrow::Type::UINT32: - { + case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); if (read_ints_as_dates) column.type = std::make_shared(); @@ -380,10 +383,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: return readColumnWithDecimalData(arrow_column, column_name); - case arrow::Type::MAP: - { + case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column + = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -392,17 +395,16 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( auto map_type = std::make_shared(tuple_type->getElements()[0], tuple_type->getElements()[1]); return {std::move(map_column), std::move(map_type), column_name}; } - case arrow::Type::LIST: - { + case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column + = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(nested_column.column, offsets_column); auto array_type = std::make_shared(nested_column.type); return {std::move(array_column), std::move(array_type), column_name}; } - case arrow::Type::STRUCT: - { + case arrow::Type::STRUCT: { auto arrow_type = arrow_column->type(); auto * arrow_struct_type = assert_cast(arrow_type.get()); std::vector nested_arrow_columns(arrow_struct_type->num_fields()); @@ -420,7 +422,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); + auto element = readColumnFromArrowColumn( + nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -430,8 +433,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( auto tuple_type = std::make_shared(std::move(tuple_types), std::move(tuple_names)); return {std::move(tuple_column), std::move(tuple_type), column_name}; } - case arrow::Type::DICTIONARY: - { + case arrow::Type::DICTIONARY: { auto & dict_values = dictionary_values[column_name]; /// Load dictionary values only once and reuse it. 
if (!dict_values) @@ -443,12 +445,14 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto dict_column + = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); - static_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); + static_cast(tmp_dict_column.get()) + ->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); dict_column.column = std::move(tmp_dict_column); dict_values = std::make_shared(std::move(dict_column)); } @@ -469,13 +473,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ return readColumnWithNumericData(arrow_column, column_name); - FOR_ARROW_NUMERIC_TYPES(DISPATCH) + FOR_ARROW_NUMERIC_TYPES(DISPATCH) # undef DISPATCH // TODO: read JSON as a string? // TODO: read UUID as a string? default: - throw Exception(ErrorCodes::UNKNOWN_TYPE, - "Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name); + throw Exception( + ErrorCodes::UNKNOWN_TYPE, + "Unsupported {} type '{}' of an input column '{}'.", + format_name, + arrow_column->type()->name(), + column_name); } } @@ -495,7 +503,7 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, for (const auto & field : schema.fields()) { /// Create empty arrow column by it's type and convert it to ClickHouse column. 
- arrow::MemoryPool* pool = arrow::default_memory_pool(); + arrow::MemoryPool * pool = arrow::default_memory_pool(); std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); @@ -507,7 +515,8 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + ColumnWithTypeAndName sample_column + = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); if (lowercase_names) { @@ -520,8 +529,16 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, } ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const Block & header_, const std::string & format_name_, bool import_nested_, bool allow_missing_columns_, bool case_insensitive_matching_) - : header(header_), format_name(format_name_), import_nested(import_nested_), allow_missing_columns(allow_missing_columns_), case_insensitive_matching(case_insensitive_matching_) + const Block & header_, + const std::string & format_name_, + bool import_nested_, + bool allow_missing_columns_, + bool case_insensitive_matching_) + : header(header_) + , format_name(format_name_) + , import_nested(import_nested_) + , allow_missing_columns(allow_missing_columns_) + , case_insensitive_matching(case_insensitive_matching_) { } @@ -567,7 +584,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & String search_nested_table_name = nested_table_name; if (case_insensitive_matching) { - boost::to_lower(search_nested_table_name); + boost::to_lower(search_nested_table_name); } if (!name_to_column_ptr.contains(search_column_name)) { @@ -577,7 +594,8 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(search_nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[search_nested_table_name]; - ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; + ColumnsWithTypeAndName cols + = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); nested_tables[search_nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -619,8 +637,11 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & } catch (Exception & e) { - e.addMessage(fmt::format("while converting column {} from type {} to type {}", - backQuote(header_column.name), column.type->getName(), header_column.type->getName())); + e.addMessage(fmt::format( + "while converting column {} from type {} to type {}", + backQuote(header_column.name), + column.type->getName(), + header_column.type->getName())); throw; } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 64918d03904..38887f06303 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -4,10 +4,10 @@ #if USE_ARROW || USE_ORC || USE_PARQUET -#include -#include -#include -#include +# include +# include +# include +# include namespace DB diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp 
b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index cb6d4a19d20..1eab922c397 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -2,13 +2,13 @@ #include "Common/StringUtils/StringUtils.h" #if USE_ORC -#include -#include -#include -#include -#include "ArrowBufferedStreams.h" -#include "ArrowColumnToCHColumn.h" -#include +# include +# include +# include +# include +# include +# include "ArrowBufferedStreams.h" +# include "ArrowColumnToCHColumn.h" namespace DB { @@ -131,7 +131,11 @@ void ORCBlockInputFormat::prepareReader() return; arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), "ORC", format_settings.orc.import_nested, format_settings.orc.allow_missing_columns, format_settings.orc.case_insensitive_column_matching); + getPort().getHeader(), + "ORC", + format_settings.orc.import_nested, + format_settings.orc.allow_missing_columns, + format_settings.orc.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); std::unordered_set nested_table_names; @@ -147,26 +151,25 @@ void ORCBlockInputFormat::prepareReader() /// so we should recursively count the number of indices we need for this type. int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - const bool contains_column = std::invoke([&] - { - if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) + const bool contains_column = std::invoke( + [&] { - return true; - } - - if (!format_settings.parquet.case_insensitive_column_matching) - { - return nested_table_names.contains(name); - } - - return std::find_if( - nested_table_names.begin(), - nested_table_names.end(), - [&](const auto & nested_table_name) + if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) { - return equalsCaseInsensitive(nested_table_name, name); - }) != nested_table_names.end(); - }); + return true; + } + + if (!format_settings.parquet.case_insensitive_column_matching) + { + return nested_table_names.contains(name); + } + + return std::find_if( + nested_table_names.begin(), + nested_table_names.end(), + [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, name); }) + != nested_table_names.end(); + }); if (contains_column) { @@ -179,7 +182,8 @@ void ORCBlockInputFormat::prepareReader() } } -ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) { } @@ -196,14 +200,9 @@ NamesAndTypesList ORCSchemaReader::readSchema() void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( - "ORC", - [](ReadBuffer &buf, - const Block &sample, - const RowInputFormatParams &, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, settings); - }); + "ORC", + [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) + { return std::make_shared(buf, sample, settings); }); factory.markFormatAsColumnOriented("ORC"); } @@ -211,11 +210,7 @@ void registerORCSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "ORC", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) - { - return std::make_shared(buf, settings); 
- } - ); + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared(buf, settings); }); } } @@ -223,14 +218,14 @@ void registerORCSchemaReader(FormatFactory & factory) namespace DB { - class FormatFactory; - void registerInputFormatORC(FormatFactory &) - { - } +class FormatFactory; +void registerInputFormatORC(FormatFactory &) +{ +} - void registerORCSchemaReader(FormatFactory &) - { - } +void registerORCSchemaReader(FormatFactory &) +{ +} } #endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 0c39ca0498b..c2e3c71d671 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -2,19 +2,19 @@ #include "Common/StringUtils/StringUtils.h" #if USE_PARQUET -#include -#include -#include -#include -#include -#include -#include -#include -#include "ArrowBufferedStreams.h" -#include "ArrowColumnToCHColumn.h" -#include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include "ArrowBufferedStreams.h" +# include "ArrowColumnToCHColumn.h" -#include +# include namespace DB @@ -26,12 +26,12 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } -#define THROW_ARROW_NOT_OK(status) \ - do \ - { \ - if (::arrow::Status _s = (status); !_s.ok()) \ - throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ - } while (false) +# define THROW_ARROW_NOT_OK(status) \ + do \ + { \ + if (::arrow::Status _s = (status); !_s.ok()) \ + throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ + } while (false) ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) : IInputFormat(std::move(header_), in_), format_settings(format_settings_) @@ -55,8 +55,7 @@ Chunk ParquetBlockInputFormat::generate() std::shared_ptr table; arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, column_indices, &table); if (!read_status.ok()) - throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), - ErrorCodes::CANNOT_READ_ALL_DATA}; + throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA}; ++row_group_current; @@ -133,7 +132,12 @@ void ParquetBlockInputFormat::prepareReader() row_group_total = file_reader->num_row_groups(); row_group_current = 0; - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested, format_settings.parquet.allow_missing_columns, format_settings.parquet.case_insensitive_column_matching); + arrow_column_to_ch_column = std::make_unique( + getPort().getHeader(), + "Parquet", + format_settings.parquet.import_nested, + format_settings.parquet.allow_missing_columns, + format_settings.parquet.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); std::unordered_set nested_table_names; @@ -149,26 +153,25 @@ void ParquetBlockInputFormat::prepareReader() int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - const bool contains_column = std::invoke([&] - { - if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) + const bool contains_column = std::invoke( + [&] { - return true; - } - - if (!format_settings.parquet.case_insensitive_column_matching) - { - return 
nested_table_names.contains(name); - } - - return std::find_if( - nested_table_names.begin(), - nested_table_names.end(), - [&](const auto & nested_table_name) + if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) { - return equalsCaseInsensitive(nested_table_name, name); - }) != nested_table_names.end(); - }); + return true; + } + + if (!format_settings.parquet.case_insensitive_column_matching) + { + return nested_table_names.contains(name); + } + + return std::find_if( + nested_table_names.begin(), + nested_table_names.end(), + [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, name); }) + != nested_table_names.end(); + }); if (contains_column) { @@ -180,7 +183,8 @@ void ParquetBlockInputFormat::prepareReader() } } -ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) { } @@ -197,14 +201,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema() void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( - "Parquet", - [](ReadBuffer &buf, - const Block &sample, - const RowInputFormatParams &, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, settings); - }); + "Parquet", + [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) + { return std::make_shared(buf, sample, settings); }); factory.markFormatAsColumnOriented("Parquet"); } @@ -212,11 +211,7 @@ void registerParquetSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Parquet", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) - { - return std::make_shared(buf, settings); - } - ); + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared(buf, settings); }); } } @@ -230,7 +225,9 @@ void registerInputFormatParquet(FormatFactory &) { } -void registerParquetSchemaReader(FormatFactory &) {} +void registerParquetSchemaReader(FormatFactory &) +{ +} } #endif From 0c74fa2c1936af2eec5bead599cffef6c25691aa Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 08:38:15 +0000 Subject: [PATCH 006/111] Remove unecessary code --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 7 +------ src/Processors/Formats/Impl/ArrowColumnToCHColumn.h | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 02eaa3ce952..22867102978 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -497,7 +497,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, bool lowercase_names) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -518,11 +518,6 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & 
schema, ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); - if (lowercase_names) - { - boost::to_lower(sample_column.name); - } - sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 38887f06303..d87bbcd0550 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -35,7 +35,7 @@ public: /// Get missing columns that exists in header but not in arrow::Schema std::vector getMissingColumns(const arrow::Schema & schema) const; - static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, bool lowercase_names = false); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); private: const Block & header; From 7e14ab46a3a30d8462f3511d6bb4afb4aae00e57 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 09:03:23 +0000 Subject: [PATCH 007/111] Add tests for case insensitive matching --- .../00900_long_parquet_load.reference | 3 +++ ...case_insensitive_column_matching.reference | 6 +++++ .../02240_case_insensitive_column_matching.sh | 22 ++++++++++++++++++ .../case_insensitive_column_matching.orc | Bin 0 -> 364 bytes .../case_insensitive_column_matching.parquet | Bin 0 -> 811 bytes ...nsensitive_column_matching.parquet.columns | 1 + ...e_insensitive_column_matching.parquet.json | 0 7 files changed, 32 insertions(+) create mode 100644 tests/queries/0_stateless/02240_case_insensitive_column_matching.reference create mode 100755 tests/queries/0_stateless/02240_case_insensitive_column_matching.sh create mode 100644 tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc create mode 100644 tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet create mode 100644 tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns create mode 100644 tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 6ecff505b2e..98d8e2c5e3e 100644 --- a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -88,6 +88,9 @@ idx10 ['This','is','a','test'] 22 23 24 +=== Try load data from case_insensitive_column_matching.parquet +123 1 +456 2 === Try load data from datapage_v2.snappy.parquet Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unknown encoding type.: While executing ParquetBlockInputFormat: data for INSERT was parsed from stdin: (in query: INSERT INTO parquet_load FORMAT Parquet). 
(CANNOT_READ_ALL_DATA) diff --git a/tests/queries/0_stateless/02240_case_insensitive_column_matching.reference b/tests/queries/0_stateless/02240_case_insensitive_column_matching.reference new file mode 100644 index 00000000000..5c383cb3035 --- /dev/null +++ b/tests/queries/0_stateless/02240_case_insensitive_column_matching.reference @@ -0,0 +1,6 @@ +Parquet +123 1 +456 2 +ORC +123 1 +456 2 diff --git a/tests/queries/0_stateless/02240_case_insensitive_column_matching.sh b/tests/queries/0_stateless/02240_case_insensitive_column_matching.sh new file mode 100755 index 00000000000..86e9cb7ee4c --- /dev/null +++ b/tests/queries/0_stateless/02240_case_insensitive_column_matching.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +echo "Parquet" +DATA_FILE=$CUR_DIR/data_parquet/case_insensitive_column_matching.parquet +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (iD String, scOre Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load" +${CLICKHOUSE_CLIENT} --query="drop table parquet_load" + +echo "ORC" +DATA_FILE=$CUR_DIR/data_orc/case_insensitive_column_matching.orc +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (iD String, sCorE Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_orc_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load" +${CLICKHOUSE_CLIENT} --query="drop table orc_load" diff --git a/tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc b/tests/queries/0_stateless/data_orc/case_insensitive_column_matching.orc new file mode 100644 index 0000000000000000000000000000000000000000..136f99800641e8492d33e083c54153b976fdfc55 GIT binary patch literal 364 zcmZvX%SyvQ6o$_w8HeKtVL&njkrG@47bUGX$Vyu9ve1_5UhuLO#0Mxof$!oAw8?eTu|L$}q5MRGR2h^f9BrT1xY)YJYFR zmpAuPN#Ax;$F*vfvo9<}F$sR~P*Is%=LxBx>*pf9dwP9)89)U!l-6$jPux#6HStcf m;ZC!5=lt{b3Ln$a@N77?@A}2%GARjMby%B^ALiL&n*RdqV<(FM literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet new file mode 100644 index 0000000000000000000000000000000000000000..922def77cafa3b4fe8f35896b3035e521301896d GIT binary patch literal 811 zcmb_b%}(1u5T0Fcqaa17LhQE@cj$nQvyk`POT9xwoX1wz|3pr)G$zR|6tiT3&H( zwb9h1XSAugsxIl0ZPI)II1JE&j;X4|Qwb5s_)1t^i5iMZQ7Ah{S{h}(3G-dV+j05+ zU{nxF0=?3X0`94v9jho?0nFon?GKehL*?p*<|i@$lWKkiR?9mzPu`@%eBcFL33Dyt z-!lFo?E_r@5GbRq28E^5^my>OuNCRaoOI`$qdkk=S|TkLkx`M2YXA2_;>SLKxyFUW zW%Er!i@D=G^07ze z!o<%5N9QMyt|Mr-+AhZs%!fYWcW}wu56c^05B*fe+nyx7YxV7ZGltNdIX$~h7wo_- zFRZ=eNl}DN@OwQk4*{YTxL@*u!e^s?esVrITS!Nvg?^q7FRK@W{Pg4SOLeWjSYM24 PxKwwqH~7-J_)TsBK^}x5 literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns new file mode 100644 index 00000000000..e25da8f923d --- /dev/null +++ b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.columns 
@@ -0,0 +1 @@ +`Id` Nullable(String), `Score` Nullable(Int32) diff --git a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json new file mode 100644 index 00000000000..e69de29bb2d From 0457a3998ab3f7c23fd97a7a514cf14ca9ee62d9 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 11:58:55 +0000 Subject: [PATCH 008/111] remove old test --- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 13 +++++------ ..._input_format_use_lowercase_column_name.sh | 22 ------------------- 2 files changed, 6 insertions(+), 29 deletions(-) delete mode 100755 tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 22867102978..40d9149a512 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -612,19 +612,19 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & } } - std::shared_ptr arrow_column = name_to_column_ptr[search_column_name]; ColumnWithTypeAndName column; if (read_from_nested) { column = nested_tables[search_nested_table_name]->getByName(header_column.name, case_insensitive_matching); if (case_insensitive_matching) - { column.name = header_column.name; - } } else + { + auto arrow_column = name_to_column_ptr[search_column_name]; column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); + } try { @@ -655,13 +655,12 @@ std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema for (size_t i = 0, columns = header.columns(); i < columns; ++i) { const auto & header_column = header.getByPosition(i); - auto column_name = header_column.name; bool read_from_nested = false; - String nested_table_name = Nested::extractTableName(column_name); - if (!block_from_arrow.has(column_name, case_insensitive_matching)) + String nested_table_name = Nested::extractTableName(header_column.name); + if (!block_from_arrow.has(header_column.name, case_insensitive_matching)) { if (import_nested && block_from_arrow.has(nested_table_name, case_insensitive_matching)) - read_from_nested = flatten_block_from_arrow.has(column_name, case_insensitive_matching); + read_from_nested = flatten_block_from_arrow.has(header_column.name, case_insensitive_matching); if (!read_from_nested) { diff --git a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh b/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh deleted file mode 100755 index b946addd01c..00000000000 --- a/tests/queries/0_stateless/02233_setting_input_format_use_lowercase_column_name.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-ubsan, no-fasttest - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CUR_DIR"/../shell_config.sh - -echo "Parquet" -DATA_FILE=$CUR_DIR/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS parquet_load" -${CLICKHOUSE_CLIENT} --query="CREATE TABLE parquet_load (id String, score Int32) ENGINE = Memory" -cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO parquet_load FORMAT Parquet SETTINGS input_format_use_lowercase_column_name=true" -${CLICKHOUSE_CLIENT} --query="SELECT * FROM parquet_load" -${CLICKHOUSE_CLIENT} --query="drop table parquet_load" - -echo "ORC" -DATA_FILE=$CUR_DIR/data_orc/test_setting_input_format_use_lowercase_column_name.orc -${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS orc_load" -${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (id String, score Int32) ENGINE = Memory" -cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_use_lowercase_column_name=true" -${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load" -${CLICKHOUSE_CLIENT} --query="drop table orc_load" From cb3703b46eb47c0493c1b817976e80ecaaf7353b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 12:54:56 +0000 Subject: [PATCH 009/111] Style fix --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 40d9149a512..4293eb3c1c2 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -180,12 +180,12 @@ static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr(chunk.Value(value_i)); if (days_num > DATE_LUT_MAX_EXTEND_DAY_NUM) - throw Exception{ + throw Exception( ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", days_num, column_name, - DATE_LUT_MAX_DAY_NUM}; + DATE_LUT_MAX_DAY_NUM); column_data.emplace_back(days_num); } From b278600c3eb9462acb00accc36232643a08c9958 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 07:38:26 +0000 Subject: [PATCH 010/111] rename tests --- ...reference => 02242_case_insensitive_column_matching.reference} | 0 ...lumn_matching.sh => 02242_case_insensitive_column_matching.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02240_case_insensitive_column_matching.reference => 02242_case_insensitive_column_matching.reference} (100%) rename tests/queries/0_stateless/{02240_case_insensitive_column_matching.sh => 02242_case_insensitive_column_matching.sh} (100%) diff --git a/tests/queries/0_stateless/02240_case_insensitive_column_matching.reference b/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference similarity index 100% rename from tests/queries/0_stateless/02240_case_insensitive_column_matching.reference rename to tests/queries/0_stateless/02242_case_insensitive_column_matching.reference diff --git a/tests/queries/0_stateless/02240_case_insensitive_column_matching.sh b/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh similarity index 100% rename from tests/queries/0_stateless/02240_case_insensitive_column_matching.sh rename to tests/queries/0_stateless/02242_case_insensitive_column_matching.sh From ca7844e3384dd8a7fce5ca1442f1bb40be897dda Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 09:27:20 +0000 Subject: [PATCH 011/111] Fix tests --- 
.../Formats/Impl/ArrowColumnToCHColumn.cpp | 28 ++++++++++++-- .../Formats/Impl/ArrowColumnToCHColumn.h | 3 +- .../02241_parquet_bad_column.reference | 1 + .../0_stateless/02241_parquet_bad_column.sh | 38 ++++++++++--------- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index fa87c5ef811..16ca4314ed8 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -2,6 +2,7 @@ #include #include +#include "Common/StringUtils/StringUtils.h" #if USE_ARROW || USE_ORC || USE_PARQUET @@ -497,15 +498,35 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( + const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case) { ColumnsWithTypeAndName sample_columns; std::unordered_set nested_table_names; if (hint_header) nested_table_names = Nested::getAllTableNames(*hint_header); + + const auto accept_field = [&](const auto & field_name) + { + if (!hint_header) + return false; + + if (hint_header->has(field_name, ignore_case)) + return true; + + if (ignore_case) + return nested_table_names.contains(field_name); + + return std::find_if( + nested_table_names.begin(), + nested_table_names.end(), + [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, field_name); }) + != nested_table_names.end(); + }; + for (const auto & field : schema.fields()) { - if (hint_header && !hint_header->has(field->name()) && !nested_table_names.contains(field->name())) + if (!accept_field(field->name())) continue; /// Create empty arrow column by it's type and convert it to ClickHouse column. @@ -656,8 +677,9 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & std::vector ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const { std::vector missing_columns; - auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header); + auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching); auto flatten_block_from_arrow = Nested::flatten(block_from_arrow); + for (size_t i = 0, columns = header.columns(); i < columns; ++i) { const auto & header_column = header.getByPosition(i); diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index a8b18fabd93..ff99d2b2f11 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -37,7 +37,8 @@ public: /// Transform arrow schema to ClickHouse header. If hint_header is provided, /// we will skip columns in schema that are not in hint_header. 
- static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr); + static Block arrowSchemaToCHHeader( + const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false); private: const Block & header; diff --git a/tests/queries/0_stateless/02241_parquet_bad_column.reference b/tests/queries/0_stateless/02241_parquet_bad_column.reference index f599e28b8ab..b2f7f08c170 100644 --- a/tests/queries/0_stateless/02241_parquet_bad_column.reference +++ b/tests/queries/0_stateless/02241_parquet_bad_column.reference @@ -1 +1,2 @@ 10 +10 diff --git a/tests/queries/0_stateless/02241_parquet_bad_column.sh b/tests/queries/0_stateless/02241_parquet_bad_column.sh index a160671a088..9efd11cbbe1 100755 --- a/tests/queries/0_stateless/02241_parquet_bad_column.sh +++ b/tests/queries/0_stateless/02241_parquet_bad_column.sh @@ -5,23 +5,25 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "drop table if exists test_02241" -$CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String), - caption Nullable(String), - NSFW Nullable(String), - similarity Nullable(Float64), - LICENSE Nullable(String), - url Nullable(String), - key Nullable(UInt64), - shard_id Nullable(UInt64), - status Nullable(String), - width Nullable(UInt32), - height Nullable(UInt32), - exif Nullable(String), - original_width Nullable(UInt32), - original_height Nullable(UInt32)) engine=Memory" +for case_insensitive in "true" "false"; do + $CLICKHOUSE_CLIENT -q "drop table if exists test_02241" + $CLICKHOUSE_CLIENT -q "create table test_02241 (image_path Nullable(String), + caption Nullable(String), + NSFW Nullable(String), + similarity Nullable(Float64), + LICENSE Nullable(String), + url Nullable(String), + key Nullable(UInt64), + shard_id Nullable(UInt64), + status Nullable(String), + width Nullable(UInt32), + height Nullable(UInt32), + exif Nullable(String), + original_width Nullable(UInt32), + original_height Nullable(UInt32)) engine=Memory" -cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet" + cat $CUR_DIR/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT -q "insert into test_02241 format Parquet SETTINGS input_format_parquet_case_insensitive_column_matching=$case_insensitive" -$CLICKHOUSE_CLIENT -q "select count() from test_02241" -$CLICKHOUSE_CLIENT -q "drop table test_02241" + $CLICKHOUSE_CLIENT -q "select count() from test_02241" + $CLICKHOUSE_CLIENT -q "drop table test_02241" +done From 7c11295228d61c8f689f01f508f6fecfb6edb9a1 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 09:33:11 +0000 Subject: [PATCH 012/111] Remove old test --- .../0_stateless/00900_long_parquet_load.reference | 3 --- ...input_format_use_lowercase_column_name.parquet | Bin 811 -> 0 bytes ...rmat_use_lowercase_column_name.parquet.columns | 1 - 3 files changed, 4 deletions(-) delete mode 100644 tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet delete mode 100644 tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns diff --git a/tests/queries/0_stateless/00900_long_parquet_load.reference b/tests/queries/0_stateless/00900_long_parquet_load.reference index 98d8e2c5e3e..b295a226853 100644 --- 
a/tests/queries/0_stateless/00900_long_parquet_load.reference +++ b/tests/queries/0_stateless/00900_long_parquet_load.reference @@ -342,9 +342,6 @@ Code: 33. DB::ParsingEx---tion: Error while reading Parquet data: IOError: Unkno (NULL) === Try load data from single_nan.parquet \N -=== Try load data from test_setting_input_format_use_lowercase_column_name.parquet -123 1 -456 2 === Try load data from userdata1.parquet 1454486129 1 Amanda Jordan ajordan0@com.com Female 1.197.201.2 6759521864920116 Indonesia 3/8/1971 49756.53 Internal Auditor 1E+02 1454519043 2 Albert Freeman afreeman1@is.gd Male 218.111.175.34 Canada 1/16/1968 150280.17 Accountant IV diff --git a/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet b/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet deleted file mode 100644 index 922def77cafa3b4fe8f35896b3035e521301896d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 811 zcmb_b%}(1u5T0Fcqaa17LhQE@cj$nQvyk`POT9xwoX1wz|3pr)G$zR|6tiT3&H( zwb9h1XSAugsxIl0ZPI)II1JE&j;X4|Qwb5s_)1t^i5iMZQ7Ah{S{h}(3G-dV+j05+ zU{nxF0=?3X0`94v9jho?0nFon?GKehL*?p*<|i@$lWKkiR?9mzPu`@%eBcFL33Dyt z-!lFo?E_r@5GbRq28E^5^my>OuNCRaoOI`$qdkk=S|TkLkx`M2YXA2_;>SLKxyFUW zW%Er!i@D=G^07ze z!o<%5N9QMyt|Mr-+AhZs%!fYWcW}wu56c^05B*fe+nyx7YxV7ZGltNdIX$~h7wo_- zFRZ=eNl}DN@OwQk4*{YTxL@*u!e^s?esVrITS!Nvg?^q7FRK@W{Pg4SOLeWjSYM24 PxKwwqH~7-J_)TsBK^}x5 diff --git a/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns b/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns deleted file mode 100644 index e25da8f923d..00000000000 --- a/tests/queries/0_stateless/data_parquet/test_setting_input_format_use_lowercase_column_name.parquet.columns +++ /dev/null @@ -1 +0,0 @@ -`Id` Nullable(String), `Score` Nullable(Int32) From 0c23cd7b94de5e200f69267ade7e59fe392423eb Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 10:55:10 +0000 Subject: [PATCH 013/111] Add support for case insensitive column matching in arrow --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp | 6 +++++- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 2 +- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 6d78b5d71ac..104d6f7c7bb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -615,6 +615,7 @@ class IColumn; M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \ M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ M(Bool, input_format_arrow_import_nested, false, "Allow to insert array of structs into Nested table in Arrow input format.", 0) \ + M(Bool, input_format_arrow_case_insensitive_column_matching, false, "Ignore case when matching Arrow columns with CH columns.", 0) \ M(Bool, input_format_orc_import_nested, false, "Allow to insert array of structs into Nested table in ORC input format.", 0) \ M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \ diff --git 
a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 8c25fef53cb..3aa82cb79b4 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -123,6 +123,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; + format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index de05dda9138..bd0a84d9ded 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -74,6 +74,7 @@ struct FormatSettings bool low_cardinality_as_dictionary = false; bool import_nested = false; bool allow_missing_columns = false; + bool case_insensitive_column_matching = false; } arrow; struct diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index cf5cfa681a1..37a107ae367 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -139,7 +139,11 @@ void ArrowBlockInputFormat::prepareReader() } arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), "Arrow", format_settings.arrow.import_nested, format_settings.arrow.allow_missing_columns); + getPort().getHeader(), + "Arrow", + format_settings.arrow.import_nested, + format_settings.arrow.allow_missing_columns, + format_settings.arrow.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); if (stream) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 16ca4314ed8..ba037b0cf6e 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -514,7 +514,7 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( if (hint_header->has(field_name, ignore_case)) return true; - if (ignore_case) + if (!ignore_case) return nested_table_names.contains(field_name); return std::find_if( From 0a469066e066cb1ef30411edc122b7dd199e27ec Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 10:55:20 +0000 Subject: [PATCH 014/111] Add more tests --- ...case_insensitive_column_matching.reference | 3 ++ .../02242_case_insensitive_column_matching.sh | 8 ++++++ .../02242_case_insensitive_nested.reference | 12 ++++++++ .../02242_case_insensitive_nested.sh | 26 ++++++++++++++++++ .../case_insensitive_column_matching.arrow | Bin 0 -> 658 bytes 5 files changed, 49 insertions(+) create mode 100644 tests/queries/0_stateless/02242_case_insensitive_nested.reference create mode 100755 tests/queries/0_stateless/02242_case_insensitive_nested.sh create mode 100644 tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow diff --git a/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference b/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference 
index 5c383cb3035..9732211a286 100644 --- a/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference +++ b/tests/queries/0_stateless/02242_case_insensitive_column_matching.reference @@ -4,3 +4,6 @@ Parquet ORC 123 1 456 2 +Arrow +123 1 +456 2 diff --git a/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh b/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh index 86e9cb7ee4c..8ebf2952ab3 100755 --- a/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh +++ b/tests/queries/0_stateless/02242_case_insensitive_column_matching.sh @@ -20,3 +20,11 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE orc_load (iD String, sCorE Int32) ENG cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO orc_load FORMAT ORC SETTINGS input_format_orc_case_insensitive_column_matching=true" ${CLICKHOUSE_CLIENT} --query="SELECT * FROM orc_load" ${CLICKHOUSE_CLIENT} --query="drop table orc_load" + +echo "Arrow" +DATA_FILE=$CUR_DIR/data_arrow/case_insensitive_column_matching.arrow +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS arrow_load" +${CLICKHOUSE_CLIENT} --query="CREATE TABLE arrow_load (iD String, sCorE Int32) ENGINE = Memory" +cat "$DATA_FILE" | ${CLICKHOUSE_CLIENT} -q "INSERT INTO arrow_load FORMAT Arrow SETTINGS input_format_arrow_case_insensitive_column_matching=true" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM arrow_load" +${CLICKHOUSE_CLIENT} --query="drop table arrow_load" diff --git a/tests/queries/0_stateless/02242_case_insensitive_nested.reference b/tests/queries/0_stateless/02242_case_insensitive_nested.reference new file mode 100644 index 00000000000..58d66d3230a --- /dev/null +++ b/tests/queries/0_stateless/02242_case_insensitive_nested.reference @@ -0,0 +1,12 @@ +Arrow +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] +Parquet +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] +ORC +[1,2,3] ['123','456','789'] [9.8,10.12,11.14] +[4,5,6] ['101112','131415','161718'] [123.8,10.2,11.414] +[7,8,9] ['101','415','118'] [13.08,1.12,0.414] diff --git a/tests/queries/0_stateless/02242_case_insensitive_nested.sh b/tests/queries/0_stateless/02242_case_insensitive_nested.sh new file mode 100755 index 00000000000..c22f5695dc3 --- /dev/null +++ b/tests/queries/0_stateless/02242_case_insensitive_nested.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_table" +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS nested_nested_table" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE nested_table (table Nested(eLeM1 Int32, elEm2 String, ELEM3 Float32)) engine=Memory" + +formats=('Arrow' 'Parquet' 'ORC') +format_files=('arrow' 'parquet' 'orc') + +for ((i = 0; i < 3; i++)) do + echo ${formats[i]} + + ${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE nested_table" + cat $CUR_DIR/data_orc_arrow_parquet_nested/nested_table.${format_files[i]} | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nested_table FORMAT ${formats[i]} SETTINGS input_format_${format_files[i]}_import_nested = 1, input_format_${format_files[i]}_case_insensitive_column_matching = true" + + ${CLICKHOUSE_CLIENT} --query="SELECT * FROM nested_table" + +done + +${CLICKHOUSE_CLIENT} --query="DROP TABLE nested_table" diff --git a/tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow b/tests/queries/0_stateless/data_arrow/case_insensitive_column_matching.arrow new file mode 100644 index 0000000000000000000000000000000000000000..4350d5c3e49a3b2d852661d47be41855b6327a5d GIT binary patch literal 658 zcmd5)yAHxI3_MCG70S?|LW~_*c}^@HnLuUW1Aq+)iLc@l_yk5q{s!)x6d@#buCkqT zYoC*}s_WHu1U!igfSd;O;E|$(j4Pt$$U&CTyXb+0Y;76f#(FLj)t52Hz#hP6cRC*! zGj_?)wR+N4NtdHA-Zh0h>cgqefaYj|w3d5uJz?G(V?h%<`f*}!ozQkxU)o!_9=T_2 z*wc|2X{c`l&H4-4zn~f+4Z$?x(t>6dbW#nZ@u{QyH&?DrYx<7Hlj&^ELyF((e)Ns> nJ$Ic_!IOo#92U$EX*wI#Txr5R1iO>Ce20I!&j+Wf>|cBV9KI)< literal 0 HcmV?d00001 From 6b6190554b282725111cc94384cbf3257716ae37 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 11:15:48 +0000 Subject: [PATCH 015/111] Fix conversion of arrow to CH column with hint header --- src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index ba037b0cf6e..91d276ddfe0 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -508,10 +508,7 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( const auto accept_field = [&](const auto & field_name) { - if (!hint_header) - return false; - - if (hint_header->has(field_name, ignore_case)) + if (!hint_header || hint_header->has(field_name, ignore_case)) return true; if (!ignore_case) From 1e0ad94d6731f1c65dbc5a522451d8d08de5ddda Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Tue, 15 Mar 2022 14:08:30 +0100 Subject: [PATCH 016/111] Add a stand alone clickhouse-keeper package --- docker/packager/packager | 1 + packages/clickhouse-keeper.yaml | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 packages/clickhouse-keeper.yaml diff --git a/docker/packager/packager b/docker/packager/packager index a5763273f5f..18f56ee93ad 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -163,6 +163,7 @@ def parse_env_variables( cmake_flags.append("-DCMAKE_INSTALL_PREFIX=/usr") cmake_flags.append("-DCMAKE_INSTALL_SYSCONFDIR=/etc") cmake_flags.append("-DCMAKE_INSTALL_LOCALSTATEDIR=/var") + cmake_flags.append("-DBUILD_STANDALONE_KEEPER=ON") if is_release_build(build_type, package_type, sanitizer, split_binary): cmake_flags.append("-DINSTALL_STRIPPED_BINARIES=ON") diff --git a/packages/clickhouse-keeper.yaml b/packages/clickhouse-keeper.yaml new file mode 100644 index 00000000000..e717ba79c5b --- /dev/null +++ b/packages/clickhouse-keeper.yaml @@ -0,0 +1,38 @@ +# package sources should be placed in ${PWD}/root +# nfpm should run from the same directory with a config +name: "clickhouse-keeper" +arch: "${DEB_ARCH}" # amd64, arm64 +platform: "linux" +version: "${CLICKHOUSE_VERSION_STRING}" +vendor: "ClickHouse Inc." +homepage: "https://clickhouse.com" +license: "Apache" +section: "database" +priority: "optional" + +conflicts: +- clickhouse-server +depends: +- adduser + +maintainer: "ClickHouse Dev Team " +description: | + Static clickhouse-keeper binary + A stand-alone clickhouse-keeper package + + +contents: +- src: root/etc/clickhouse-keeper + dst: /etc/clickhouse-keeper + type: config +- src: root/usr/bin/clickhouse-keeper + dst: /usr/bin/clickhouse-keeper +# docs +- src: ../AUTHORS + dst: /usr/share/doc/clickhouse-keeper/AUTHORS +- src: ../CHANGELOG.md + dst: /usr/share/doc/clickhouse-keeper/CHANGELOG.md +- src: ../LICENSE + dst: /usr/share/doc/clickhouse-keeper/LICENSE +- src: ../README.md + dst: /usr/share/doc/clickhouse-keeper/README.md From a4ab73619f012acf2db96b64342829b3be4b804c Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 16 Mar 2022 11:37:18 +0100 Subject: [PATCH 017/111] Fix UBSan build --- programs/keeper/CMakeLists.txt | 6 ------ src/Compression/CompressionFactory.cpp | 17 ++++++++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 92bb5dc45a3..b1132e3aaea 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -71,17 +71,11 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressedReadBuffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressedReadBufferFromFile.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressedWriteBuffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecDelta.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecDoubleDelta.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecEncrypted.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecGorilla.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecLZ4.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecMultiple.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecNone.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecT64.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionCodecZSTD.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/CompressionFactory.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/getCompressionCodecForFile.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/ICompressionCodec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/LZ4_decompress_faster.cpp diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index ca5e5176d13..8dfc894e15b 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -165,25 +165,36 @@ void registerCodecNone(CompressionCodecFactory & factory); void registerCodecLZ4(CompressionCodecFactory & factory); void registerCodecLZ4HC(CompressionCodecFactory & factory); void registerCodecZSTD(CompressionCodecFactory & factory); +void registerCodecMultiple(CompressionCodecFactory & factory); + + +/// Keeper use only general-purpose codes, so we don't need these special codecs +/// in standalone build +#ifndef KEEPER_STANDALONE_BUILD + void registerCodecDelta(CompressionCodecFactory & factory); void registerCodecT64(CompressionCodecFactory & factory); void registerCodecDoubleDelta(CompressionCodecFactory & factory); void registerCodecGorilla(CompressionCodecFactory & factory); void registerCodecEncrypted(CompressionCodecFactory & factory); -void registerCodecMultiple(CompressionCodecFactory & factory); + +#endif CompressionCodecFactory::CompressionCodecFactory() { - registerCodecLZ4(*this); registerCodecNone(*this); + registerCodecLZ4(*this); registerCodecZSTD(*this); registerCodecLZ4HC(*this); + registerCodecMultiple(*this); + +#ifndef KEEPER_STANDALONE_BUILD registerCodecDelta(*this); registerCodecT64(*this); registerCodecDoubleDelta(*this); registerCodecGorilla(*this); registerCodecEncrypted(*this); - registerCodecMultiple(*this); +#endif default_codec = get("LZ4", {}); } From e790a7308121427ecb68ae957af8af9d14393be5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 23 Mar 2022 15:14:30 +0100 Subject: [PATCH 018/111] Simplify strip for new packages --- cmake/strip.sh | 28 ------------------ cmake/strip_binary.cmake | 33 ++++++++++++++++++++-- packages/clickhouse-common-static-dbg.yaml | 8 ++++-- programs/CMakeLists.txt | 9 +----- programs/keeper/CMakeLists.txt | 7 ++++- programs/library-bridge/CMakeLists.txt | 1 + programs/odbc-bridge/CMakeLists.txt | 1 + 7 files changed, 45 insertions(+), 42 deletions(-) delete mode 100755 cmake/strip.sh diff --git a/cmake/strip.sh b/cmake/strip.sh deleted file mode 100755 index f85d82fab31..00000000000 --- a/cmake/strip.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -BINARY_PATH=$1 -BINARY_NAME=$(basename "$BINARY_PATH") -DESTINATION_STRIPPED_DIR=$2 -OBJCOPY_PATH=${3:objcopy} -READELF_PATH=${4:readelf} - -BUILD_ID=$($READELF_PATH -n "$1" | sed -n '/Build ID/ { s/.*: //p; q; }') -BUILD_ID_PREFIX=${BUILD_ID:0:2} -BUILD_ID_SUFFIX=${BUILD_ID:2} - -DESTINATION_DEBUG_INFO_DIR="$DESTINATION_STRIPPED_DIR/lib/debug/.build-id" -DESTINATION_STRIP_BINARY_DIR="$DESTINATION_STRIPPED_DIR/bin" - -mkdir -p "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX" -mkdir -p "$DESTINATION_STRIP_BINARY_DIR" - - -cp "$BINARY_PATH" "$DESTINATION_STRIP_BINARY_DIR/$BINARY_NAME" - -$OBJCOPY_PATH --only-keep-debug --compress-debug-sections "$DESTINATION_STRIP_BINARY_DIR/$BINARY_NAME" "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug" -chmod 0644 "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug" -chown 0:0 "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug" - -strip 
--remove-section=.comment --remove-section=.note "$DESTINATION_STRIP_BINARY_DIR/$BINARY_NAME" - -$OBJCOPY_PATH --add-gnu-debuglink "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug" "$DESTINATION_STRIP_BINARY_DIR/$BINARY_NAME" diff --git a/cmake/strip_binary.cmake b/cmake/strip_binary.cmake index e430807772d..6d0b7227c54 100644 --- a/cmake/strip_binary.cmake +++ b/cmake/strip_binary.cmake @@ -11,16 +11,43 @@ macro(clickhouse_strip_binary) message(FATAL_ERROR "A binary path name must be provided for stripping binary") endif() - if (NOT DEFINED STRIP_DESTINATION_DIR) message(FATAL_ERROR "Destination directory for stripped binary must be provided") endif() add_custom_command(TARGET ${STRIP_TARGET} POST_BUILD - COMMAND bash ${ClickHouse_SOURCE_DIR}/cmake/strip.sh ${STRIP_BINARY_PATH} ${STRIP_DESTINATION_DIR} ${OBJCOPY_PATH} ${READELF_PATH} - COMMENT "Stripping clickhouse binary" VERBATIM + COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/lib/debug/" + COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin" + COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" + COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" + COMMAND strip --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMENT "Stripping clickhouse binary" VERBATIM ) install(PROGRAMS ${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) install(DIRECTORY ${STRIP_DESTINATION_DIR}/lib/debug DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT clickhouse) endmacro() + + +macro(clickhouse_make_empty_debug_info_for_nfpm) + set(oneValueArgs TARGET DESTINATION_DIR) + cmake_parse_arguments(EMPTY_DEBUG "" "${oneValueArgs}" "" ${ARGN}) + + if (NOT DEFINED EMPTY_DEBUG_TARGET) + message(FATAL_ERROR "A target name must be provided for stripping binary") + endif() + + if (NOT DEFINED EMPTY_DEBUG_DESTINATION_DIR) + message(FATAL_ERROR "Destination directory for empty debug must be provided") + endif() + + add_custom_command(TARGET ${EMPTY_DEBUG_TARGET} POST_BUILD + COMMAND mkdir -p "${EMPTY_DEBUG_DESTINATION_DIR}/lib/debug" + COMMAND touch "${EMPTY_DEBUG_DESTINATION_DIR}/lib/debug/${EMPTY_DEBUG_TARGET}.debug" + COMMENT "Addiding empty debug info for NFPM" VERBATIM + ) + + install(FILES "${EMPTY_DEBUG_DESTINATION_DIR}/lib/debug/${EMPTY_DEBUG_TARGET}.debug" DESTINATION "${CMAKE_INSTALL_LIBDIR}/debug" COMPONENT clickhouse) +endmacro() diff --git a/packages/clickhouse-common-static-dbg.yaml b/packages/clickhouse-common-static-dbg.yaml index 1213f4215c8..349f9ec0c47 100644 --- a/packages/clickhouse-common-static-dbg.yaml +++ b/packages/clickhouse-common-static-dbg.yaml @@ -21,8 +21,12 @@ description: | This package contains the debugging symbols for clickhouse-common. 
contents: -- src: root/usr/lib/debug - dst: /usr/lib/debug +- src: root/usr/lib/debug/clickhouse.debug + dst: /usr/lib/debug/clickhouse.debug +- src: root/usr/lib/debug/clickhouse-odbc-bridge.debug + dst: /usr/lib/debug/clickhouse-odbc-bridge.debug +- src: root/usr/lib/debug/clickhouse-library-bridge.debug + dst: /usr/lib/debug/clickhouse-library-bridge.debug # docs - src: ../AUTHORS dst: /usr/share/doc/clickhouse-common-static-dbg/AUTHORS diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 1e2420021b6..cca7be97b61 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -473,18 +473,11 @@ else () if (INSTALL_STRIPPED_BINARIES) clickhouse_strip_binary(TARGET clickhouse DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/${STRIPPED_BINARIES_OUTPUT} BINARY_PATH clickhouse) else() + clickhouse_make_empty_debug_info_for_nfpm(TARGET clickhouse DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/${STRIPPED_BINARIES_OUTPUT}) install (TARGETS clickhouse RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) endif() endif() -if (NOT INSTALL_STRIPPED_BINARIES) - # Install dunny debug directory - # TODO: move logic to every place where clickhouse_strip_binary is used - add_custom_command(TARGET clickhouse POST_BUILD COMMAND echo > .empty ) - install(FILES "${CMAKE_CURRENT_BINARY_DIR}/.empty" DESTINATION ${CMAKE_INSTALL_LIBDIR}/debug/.empty) -endif() - - if (ENABLE_TESTS) set (CLICKHOUSE_UNIT_TESTS_TARGETS unit_tests_dbms) add_custom_target (clickhouse-tests ALL DEPENDS ${CLICKHOUSE_UNIT_TESTS_TARGETS}) diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 92bb5dc45a3..9491d503fbf 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -137,5 +137,10 @@ if (BUILD_STANDALONE_KEEPER) add_dependencies(clickhouse-keeper clickhouse_keeper_configs) set_target_properties(clickhouse-keeper PROPERTIES RUNTIME_OUTPUT_DIRECTORY ../) - install(TARGETS clickhouse-keeper RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + if (INSTALL_STRIPPED_BINARIES) + clickhouse_strip_binary(TARGET clickhouse-keeper DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT} BINARY_PATH ../clickhouse-keeper) + else() + clickhouse_make_empty_debug_info_for_nfpm(TARGET clickhouse-keeper DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT}) + install(TARGETS clickhouse-keeper RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + endif() endif() diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index aded9664b35..90ce3d8be7f 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -27,5 +27,6 @@ set_target_properties(clickhouse-library-bridge PROPERTIES RUNTIME_OUTPUT_DIRECT if (INSTALL_STRIPPED_BINARIES) clickhouse_strip_binary(TARGET clickhouse-library-bridge DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT} BINARY_PATH ../clickhouse-library-bridge) else() + clickhouse_make_empty_debug_info_for_nfpm(TARGET clickhouse-library-bridge DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT}) install(TARGETS clickhouse-library-bridge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) endif() diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 50a8bb629c8..b530e08ca26 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -42,6 +42,7 @@ endif() if 
(INSTALL_STRIPPED_BINARIES) clickhouse_strip_binary(TARGET clickhouse-odbc-bridge DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT} BINARY_PATH ../clickhouse-odbc-bridge) else() + clickhouse_make_empty_debug_info_for_nfpm(TARGET clickhouse-odbc-bridge DESTINATION_DIR ${CMAKE_CURRENT_BINARY_DIR}/../${STRIPPED_BINARIES_OUTPUT}) install(TARGETS clickhouse-odbc-bridge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) endif() From df0d3c93040c1cfb7896b3fb8c80b15c83f8a7e0 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 23 Mar 2022 16:11:46 +0100 Subject: [PATCH 019/111] Fix parsing of IPv6 addresses longer than 39 characters --- src/Common/formatIPv6.h | 2 +- .../0_stateless/02243_ipv6_long_parsing.reference | 3 +++ tests/queries/0_stateless/02243_ipv6_long_parsing.sql | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02243_ipv6_long_parsing.reference create mode 100644 tests/queries/0_stateless/02243_ipv6_long_parsing.sql diff --git a/src/Common/formatIPv6.h b/src/Common/formatIPv6.h index 1a65adae55b..d6efeed17e6 100644 --- a/src/Common/formatIPv6.h +++ b/src/Common/formatIPv6.h @@ -11,7 +11,7 @@ constexpr size_t IPV4_BINARY_LENGTH = 4; constexpr size_t IPV6_BINARY_LENGTH = 16; constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not count tail zero byte. -constexpr size_t IPV6_MAX_TEXT_LENGTH = 39; +constexpr size_t IPV6_MAX_TEXT_LENGTH = 45; /// Does not count tail zero byte. namespace DB { diff --git a/tests/queries/0_stateless/02243_ipv6_long_parsing.reference b/tests/queries/0_stateless/02243_ipv6_long_parsing.reference new file mode 100644 index 00000000000..c09bfebe9d5 --- /dev/null +++ b/tests/queries/0_stateless/02243_ipv6_long_parsing.reference @@ -0,0 +1,3 @@ +0 ::ffff:1.12.12.12 +1 ::ffff:123.123.123.123 +2 ::ffff:192.168.100.228 diff --git a/tests/queries/0_stateless/02243_ipv6_long_parsing.sql b/tests/queries/0_stateless/02243_ipv6_long_parsing.sql new file mode 100644 index 00000000000..25225ee0fa8 --- /dev/null +++ b/tests/queries/0_stateless/02243_ipv6_long_parsing.sql @@ -0,0 +1,10 @@ +DROP TABLE IF EXISTS test_table; +CREATE TABLE test_table (id UInt64, value IPv6) ENGINE=MergeTree ORDER BY id; + +INSERT INTO test_table VALUES (0, '0000:0000:0000:0000:0000:ffff:1.12.12.12'); +INSERT INTO test_table VALUES (1, '0000:0000:0000:0000:0000:ffff:123.123.123.123'); +INSERT INTO test_table VALUES (2, '0000:0000:0000:0000:0000:ffff:192.168.100.228'); + +SELECT * FROM test_table ORDER BY id; + +DROP TABLE test_table; From e0694ea5ba5b301275670c4a772660600ec99f32 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 23 Mar 2022 23:22:37 +0800 Subject: [PATCH 020/111] Pasting improvement of clickhouse-client --- contrib/replxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/replxx b/contrib/replxx index 9460e5e0fc1..6f0b6f151ae 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 9460e5e0fc10f78f460af26a6bd928798cac864d +Subproject commit 6f0b6f151ae2a044625ae93acd19ca365fcea64d From 052057f2ef4226adc606366340c94ce0c0ff2715 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 15:02:19 +0000 Subject: [PATCH 021/111] Address PR comments --- src/Common/StringUtils/StringUtils.h | 9 - src/Core/Block.cpp | 5 +- src/DataTypes/NestedUtils.cpp | 9 +- src/DataTypes/NestedUtils.h | 2 +- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 164 +++++++----------- .../Formats/Impl/ArrowColumnToCHColumn.h | 8 +- 
.../Formats/Impl/ORCBlockInputFormat.cpp | 73 ++++---- .../Formats/Impl/ORCBlockInputFormat.h | 2 - .../Formats/Impl/ParquetBlockInputFormat.cpp | 86 ++++----- ...e_insensitive_column_matching.parquet.json | 0 10 files changed, 148 insertions(+), 210 deletions(-) delete mode 100644 tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json diff --git a/src/Common/StringUtils/StringUtils.h b/src/Common/StringUtils/StringUtils.h index e1a753e816d..21df0f5ae8b 100644 --- a/src/Common/StringUtils/StringUtils.h +++ b/src/Common/StringUtils/StringUtils.h @@ -240,15 +240,6 @@ inline bool equalsCaseInsensitive(char a, char b) return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b); } -inline bool equalsCaseInsensitive(const std::string_view a, const std::string_view b) -{ - if (a.length() != b.length()) - return false; - - return std::equal( - a.begin(), a.end(), b.begin(), [](const auto first, const auto second) { return equalsCaseInsensitive(first, second); }); -} - template std::string trim(const std::string & str, F && predicate) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 306f99d7c24..a7142ef7f2e 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace DB @@ -273,7 +274,7 @@ const ColumnWithTypeAndName * Block::findByName(const std::string & name, bool c { if (case_insensitive) { - auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return equalsCaseInsensitive(column.name, name); }); + auto found = std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); }); if (found == data.end()) { return nullptr; @@ -304,7 +305,7 @@ const ColumnWithTypeAndName & Block::getByName(const std::string & name, bool ca bool Block::has(const std::string & name, bool case_insensitive) const { if (case_insensitive) - return std::find_if(data.begin(), data.end(), [&](const auto & column) { return equalsCaseInsensitive(column.name, name); }) + return std::find_if(data.begin(), data.end(), [&](const auto & column) { return boost::iequals(column.name, name); }) != data.end(); return index_by_name.end() != index_by_name.find(name); diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index df504bc34a8..cfacdd252e2 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -15,6 +15,8 @@ #include +#include + namespace DB { @@ -227,12 +229,15 @@ void validateArraySizes(const Block & block) } -std::unordered_set getAllTableNames(const Block & block) +std::unordered_set getAllTableNames(const Block & block, bool to_lower_case) { std::unordered_set nested_table_names; - for (auto & name : block.getNames()) + for (const auto & name : block.getNames()) { auto nested_table_name = Nested::extractTableName(name); + if (to_lower_case) + boost::to_lower(nested_table_name); + if (!nested_table_name.empty()) nested_table_names.insert(nested_table_name); } diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 2ca5c17dc74..f6dc42d5c58 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -32,7 +32,7 @@ namespace Nested void validateArraySizes(const Block & block); /// Get all nested tables names from a block. 
- std::unordered_set getAllTableNames(const Block & block); + std::unordered_set getAllTableNames(const Block & block, bool to_lower_case = false); } } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 91d276ddfe0..0a72e561e4e 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1,44 +1,40 @@ #include "ArrowColumnToCHColumn.h" -#include -#include -#include "Common/StringUtils/StringUtils.h" - #if USE_ARROW || USE_ORC || USE_PARQUET -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include - +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn. -# define FOR_ARROW_NUMERIC_TYPES(M) \ +#define FOR_ARROW_NUMERIC_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::INT8, DB::Int8) \ M(arrow::Type::INT16, DB::Int16) \ @@ -49,7 +45,7 @@ M(arrow::Type::FLOAT, DB::Float32) \ M(arrow::Type::DOUBLE, DB::Float64) -# define FOR_ARROW_INDEXES_TYPES(M) \ +#define FOR_ARROW_INDEXES_TYPES(M) \ M(arrow::Type::UINT8, DB::UInt8) \ M(arrow::Type::INT8, DB::UInt8) \ M(arrow::Type::UINT16, DB::UInt16) \ @@ -73,7 +69,6 @@ namespace ErrorCodes extern const int INCORRECT_NUMBER_OF_COLUMNS; } - /// Inserts numeric data right into internal column data to reduce an overhead template > static ColumnWithTypeAndName readColumnWithNumericData(std::shared_ptr & arrow_column, const String & column_name) @@ -181,12 +176,8 @@ static ColumnWithTypeAndName readColumnWithDate32Data(std::shared_ptr(chunk.Value(value_i)); if (days_num > DATE_LUT_MAX_EXTEND_DAY_NUM) - throw Exception( - ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, - "Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", - days_num, - column_name, - DATE_LUT_MAX_DAY_NUM); + throw Exception{ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, + "Input value {} of a column \"{}\" is greater than max allowed Date value, which is {}", days_num, column_name, DATE_LUT_MAX_DAY_NUM}; column_data.emplace_back(days_num); } @@ -235,8 +226,7 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName -readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) +static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); @@ -248,8 +238,7 @@ readColumnWithDecimalDataImpl(std::shared_ptr & arrow_colum auto & chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); for (size_t value_i = 0, length = static_cast(chunk.length()); value_i < length; ++value_i) { - column_data.emplace_back( - chunk.IsNull(value_i) ? 
DecimalType(0) : *reinterpret_cast(chunk.Value(value_i))); // TODO: copy column + column_data.emplace_back(chunk.IsNull(value_i) ? DecimalType(0) : *reinterpret_cast(chunk.Value(value_i))); // TODO: copy column } } return {std::move(internal_column), internal_type, column_name}; @@ -310,9 +299,10 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr switch (arrow_column->type()->id()) { # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ - case ARROW_NUMERIC_TYPE: { \ - return readColumnWithNumericData(arrow_column, "").column; \ - } + case ARROW_NUMERIC_TYPE: \ + { \ + return readColumnWithNumericData(arrow_column, "").column; \ + } FOR_ARROW_INDEXES_TYPES(DISPATCH) # undef DISPATCH default: @@ -366,13 +356,15 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( // ClickHouse writes Date as arrow UINT16 and DateTime as arrow UINT32, // so, read UINT16 as Date and UINT32 as DateTime to perform correct conversion // between Date and DateTime further. - case arrow::Type::UINT16: { + case arrow::Type::UINT16: + { auto column = readColumnWithNumericData(arrow_column, column_name); if (read_ints_as_dates) column.type = std::make_shared(); return column; } - case arrow::Type::UINT32: { + case arrow::Type::UINT32: + { auto column = readColumnWithNumericData(arrow_column, column_name); if (read_ints_as_dates) column.type = std::make_shared(); @@ -384,10 +376,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: return readColumnWithDecimalData(arrow_column, column_name); - case arrow::Type::MAP: { + case arrow::Type::MAP: + { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column - = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -396,16 +388,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( auto map_type = std::make_shared(tuple_type->getElements()[0], tuple_type->getElements()[1]); return {std::move(map_column), std::move(map_type), column_name}; } - case arrow::Type::LIST: { + case arrow::Type::LIST: + { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column - = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(nested_column.column, offsets_column); auto array_type = std::make_shared(nested_column.type); return {std::move(array_column), std::move(array_type), column_name}; } - case arrow::Type::STRUCT: { + case arrow::Type::STRUCT: + { auto arrow_type = arrow_column->type(); auto * arrow_struct_type = assert_cast(arrow_type.get()); std::vector nested_arrow_columns(arrow_struct_type->num_fields()); @@ -423,8 +416,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn( - 
nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -434,7 +426,8 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( auto tuple_type = std::make_shared(std::move(tuple_types), std::move(tuple_names)); return {std::move(tuple_column), std::move(tuple_type), column_name}; } - case arrow::Type::DICTIONARY: { + case arrow::Type::DICTIONARY: + { auto & dict_values = dictionary_values[column_name]; /// Load dictionary values only once and reuse it. if (!dict_values) @@ -446,14 +439,12 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column - = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); auto tmp_dict_column = IColumn::mutate(assert_cast(tmp_lc_column.get())->getDictionaryPtr()); - static_cast(tmp_dict_column.get()) - ->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); + static_cast(tmp_dict_column.get())->uniqueInsertRangeFrom(*dict_column.column, 0, dict_column.column->size()); dict_column.column = std::move(tmp_dict_column); dict_values = std::make_shared(std::move(dict_column)); } @@ -474,17 +465,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( # define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \ case ARROW_NUMERIC_TYPE: \ return readColumnWithNumericData(arrow_column, column_name); - FOR_ARROW_NUMERIC_TYPES(DISPATCH) + FOR_ARROW_NUMERIC_TYPES(DISPATCH) # undef DISPATCH // TODO: read JSON as a string? // TODO: read UUID as a string? default: - throw Exception( - ErrorCodes::UNKNOWN_TYPE, - "Unsupported {} type '{}' of an input column '{}'.", - format_name, - arrow_column->type()->name(), - column_name); + throw Exception(ErrorCodes::UNKNOWN_TYPE, + "Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name); } } @@ -504,26 +491,12 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( ColumnsWithTypeAndName sample_columns; std::unordered_set nested_table_names; if (hint_header) - nested_table_names = Nested::getAllTableNames(*hint_header); - - const auto accept_field = [&](const auto & field_name) - { - if (!hint_header || hint_header->has(field_name, ignore_case)) - return true; - - if (!ignore_case) - return nested_table_names.contains(field_name); - - return std::find_if( - nested_table_names.begin(), - nested_table_names.end(), - [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, field_name); }) - != nested_table_names.end(); - }; + nested_table_names = Nested::getAllTableNames(*hint_header, ignore_case); for (const auto & field : schema.fields()) { - if (!accept_field(field->name())) + if (hint_header && !hint_header->has(field->name(), ignore_case) + && !nested_table_names.contains(ignore_case ? 
boost::to_lower_copy(field->name()) : field->name())) continue; /// Create empty arrow column by it's type and convert it to ClickHouse column. @@ -539,8 +512,7 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader( arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column - = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); sample_columns.emplace_back(std::move(sample_column)); } @@ -571,9 +543,7 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptrsecond->length(); columns_list.reserve(header.rows()); @@ -594,17 +565,14 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & auto search_column_name = header_column.name; if (case_insensitive_matching) - { boost::to_lower(search_column_name); - } bool read_from_nested = false; String nested_table_name = Nested::extractTableName(header_column.name); String search_nested_table_name = nested_table_name; if (case_insensitive_matching) - { boost::to_lower(search_nested_table_name); - } + if (!name_to_column_ptr.contains(search_column_name)) { /// Check if it's a column from nested table. diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index ff99d2b2f11..0a712326941 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -4,10 +4,10 @@ #if USE_ARROW || USE_ORC || USE_PARQUET -# include -# include -# include -# include +#include +#include +#include +#include namespace DB diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 1eab922c397..c68b59833db 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -1,14 +1,14 @@ #include "ORCBlockInputFormat.h" -#include "Common/StringUtils/StringUtils.h" +#include #if USE_ORC -# include -# include -# include -# include -# include -# include "ArrowBufferedStreams.h" -# include "ArrowColumnToCHColumn.h" +#include +#include +#include +#include +#include "ArrowBufferedStreams.h" +#include "ArrowColumnToCHColumn.h" +#include namespace DB { @@ -138,9 +138,10 @@ void ORCBlockInputFormat::prepareReader() format_settings.orc.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); + const bool ignore_case = format_settings.orc.case_insensitive_column_matching; std::unordered_set nested_table_names; if (format_settings.orc.import_nested) - nested_table_names = Nested::getAllTableNames(getPort().getHeader()); + nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); /// In ReadStripe column indices should be started from 1, /// because 0 indicates to select all columns. @@ -151,29 +152,8 @@ void ORCBlockInputFormat::prepareReader() /// so we should recursively count the number of indices we need for this type. 
int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - const bool contains_column = std::invoke( - [&] - { - if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) - { - return true; - } - - if (!format_settings.parquet.case_insensitive_column_matching) - { - return nested_table_names.contains(name); - } - - return std::find_if( - nested_table_names.begin(), - nested_table_names.end(), - [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, name); }) - != nested_table_names.end(); - }); - - if (contains_column) + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? boost::to_lower_copy(name) : name)) { - column_names.push_back(name); for (int j = 0; j != indexes_count; ++j) include_indices.push_back(index + j); } @@ -200,9 +180,14 @@ NamesAndTypesList ORCSchemaReader::readSchema() void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( - "ORC", - [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) - { return std::make_shared(buf, sample, settings); }); + "ORC", + [](ReadBuffer &buf, + const Block &sample, + const RowInputFormatParams &, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, settings); + }); factory.markFormatAsColumnOriented("ORC"); } @@ -210,7 +195,11 @@ void registerORCSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "ORC", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared(buf, settings); }); + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); } } @@ -218,14 +207,14 @@ void registerORCSchemaReader(FormatFactory & factory) namespace DB { -class FormatFactory; -void registerInputFormatORC(FormatFactory &) -{ -} + class FormatFactory; + void registerInputFormatORC(FormatFactory &) + { + } -void registerORCSchemaReader(FormatFactory &) -{ -} + void registerORCSchemaReader(FormatFactory &) + { + } } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index bb136d02d6e..b7a771730ea 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -45,8 +45,6 @@ private: std::unique_ptr arrow_column_to_ch_column; - std::vector column_names; - // indices of columns to read from ORC file std::vector include_indices; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index c2e3c71d671..13582ce5019 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -1,21 +1,19 @@ #include "ParquetBlockInputFormat.h" -#include "Common/StringUtils/StringUtils.h" +#include + #if USE_PARQUET -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include "ArrowBufferedStreams.h" -# include "ArrowColumnToCHColumn.h" - -# include - +#include +#include +#include +#include +#include +#include +#include +#include +#include "ArrowBufferedStreams.h" +#include "ArrowColumnToCHColumn.h" +#include namespace DB { @@ -26,12 +24,12 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } -# define THROW_ARROW_NOT_OK(status) \ - do \ - { \ - if (::arrow::Status _s = (status); 
!_s.ok()) \ - throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ - } while (false) +#define THROW_ARROW_NOT_OK(status) \ + do \ + { \ + if (::arrow::Status _s = (status); !_s.ok()) \ + throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ + } while (false) ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) : IInputFormat(std::move(header_), in_), format_settings(format_settings_) @@ -140,9 +138,10 @@ void ParquetBlockInputFormat::prepareReader() format_settings.parquet.case_insensitive_column_matching); missing_columns = arrow_column_to_ch_column->getMissingColumns(*schema); + const bool ignore_case = format_settings.parquet.case_insensitive_column_matching; std::unordered_set nested_table_names; if (format_settings.parquet.import_nested) - nested_table_names = Nested::getAllTableNames(getPort().getHeader()); + nested_table_names = Nested::getAllTableNames(getPort().getHeader(), ignore_case); int index = 0; for (int i = 0; i < schema->num_fields(); ++i) @@ -153,27 +152,7 @@ void ParquetBlockInputFormat::prepareReader() int indexes_count = countIndicesForType(schema->field(i)->type()); const auto & name = schema->field(i)->name(); - const bool contains_column = std::invoke( - [&] - { - if (getPort().getHeader().has(name, format_settings.parquet.case_insensitive_column_matching)) - { - return true; - } - - if (!format_settings.parquet.case_insensitive_column_matching) - { - return nested_table_names.contains(name); - } - - return std::find_if( - nested_table_names.begin(), - nested_table_names.end(), - [&](const auto & nested_table_name) { return equalsCaseInsensitive(nested_table_name, name); }) - != nested_table_names.end(); - }); - - if (contains_column) + if (getPort().getHeader().has(name, ignore_case) || nested_table_names.contains(ignore_case ? 
boost::to_lower_copy(name) : name)) { for (int j = 0; j != indexes_count; ++j) column_indices.push_back(index + j); @@ -201,9 +180,14 @@ NamesAndTypesList ParquetSchemaReader::readSchema() void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( - "Parquet", - [](ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, const FormatSettings & settings) - { return std::make_shared(buf, sample, settings); }); + "Parquet", + [](ReadBuffer &buf, + const Block &sample, + const RowInputFormatParams &, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, settings); + }); factory.markFormatAsColumnOriented("Parquet"); } @@ -211,7 +195,11 @@ void registerParquetSchemaReader(FormatFactory & factory) { factory.registerSchemaReader( "Parquet", - [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) { return std::make_shared(buf, settings); }); + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); } } @@ -225,9 +213,7 @@ void registerInputFormatParquet(FormatFactory &) { } -void registerParquetSchemaReader(FormatFactory &) -{ -} +void registerParquetSchemaReader(FormatFactory &) {} } #endif diff --git a/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json b/tests/queries/0_stateless/data_parquet/case_insensitive_column_matching.parquet.json deleted file mode 100644 index e69de29bb2d..00000000000 From 8561c366be577be25259260f7253b4fe3c1716be Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 16:19:59 +0000 Subject: [PATCH 022/111] Move nested table name --- src/DataTypes/NestedUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index cfacdd252e2..8f5e40de5b8 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -239,7 +239,7 @@ std::unordered_set getAllTableNames(const Block & block, bool to_lower_c boost::to_lower(nested_table_name); if (!nested_table_name.empty()) - nested_table_names.insert(nested_table_name); + nested_table_names.insert(std::move(nested_table_name)); } return nested_table_names; } From 9e6f0ae9f24da27a7bbe8514f0d15af352d5ce4b Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 23 Mar 2022 18:44:09 +0100 Subject: [PATCH 023/111] Fix strip path --- cmake/strip_binary.cmake | 2 +- cmake/tools.cmake | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/strip_binary.cmake b/cmake/strip_binary.cmake index 6d0b7227c54..cbfd8a95c33 100644 --- a/cmake/strip_binary.cmake +++ b/cmake/strip_binary.cmake @@ -21,7 +21,7 @@ macro(clickhouse_strip_binary) COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" - COMMAND strip --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMENT "Stripping clickhouse binary" VERBATIM ) diff --git a/cmake/tools.cmake 
b/cmake/tools.cmake index d6fddd0509e..d571a46ad26 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -170,32 +170,32 @@ else () message (FATAL_ERROR "Cannot find objcopy.") endif () -# Readelf (FIXME copypaste) +# Strip (FIXME copypaste) if (COMPILER_GCC) - find_program (READELF_PATH NAMES "llvm-readelf" "llvm-readelf-13" "llvm-readelf-12" "llvm-readelf-11" "readelf") + find_program (STRIP_PATH NAMES "llvm-strip" "llvm-strip-13" "llvm-strip-12" "llvm-strip-11" "strip") else () - find_program (READELF_PATH NAMES "llvm-readelf-${COMPILER_VERSION_MAJOR}" "llvm-readelf" "readelf") + find_program (STRIP_PATH NAMES "llvm-strip-${COMPILER_VERSION_MAJOR}" "llvm-strip" "strip") endif () -if (NOT READELF_PATH AND OS_DARWIN) +if (NOT STRIP_PATH AND OS_DARWIN) find_program (BREW_PATH NAMES "brew") if (BREW_PATH) execute_process (COMMAND ${BREW_PATH} --prefix llvm ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE LLVM_PREFIX) if (LLVM_PREFIX) - find_program (READELF_PATH NAMES "llvm-readelf" PATHS "${LLVM_PREFIX}/bin" NO_DEFAULT_PATH) + find_program (STRIP_PATH NAMES "llvm-strip" PATHS "${LLVM_PREFIX}/bin" NO_DEFAULT_PATH) endif () - if (NOT READELF_PATH) + if (NOT STRIP_PATH) execute_process (COMMAND ${BREW_PATH} --prefix binutils ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE BINUTILS_PREFIX) if (BINUTILS_PREFIX) - find_program (READELF_PATH NAMES "readelf" PATHS "${BINUTILS_PREFIX}/bin" NO_DEFAULT_PATH) + find_program (STRIP_PATH NAMES "strip" PATHS "${BINUTILS_PREFIX}/bin" NO_DEFAULT_PATH) endif () endif () endif () endif () -if (READELF_PATH) - message (STATUS "Using readelf: ${READELF_PATH}") +if (STRIP_PATH) + message (STATUS "Using strip: ${STRIP_PATH}") else () - message (FATAL_ERROR "Cannot find readelf.") + message (FATAL_ERROR "Cannot find strip.") endif () From 86a19125247c064717edceaa5578813a790ac2ee Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 23 Mar 2022 18:14:07 +0000 Subject: [PATCH 024/111] Update docs/en/operations/settings/settings.md --- docs/en/operations/settings/settings.md | 30 +++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 56f1e7fe3bb..84454d1a01e 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4207,10 +4207,36 @@ Possible values: - 0 — Disabled. - 1 — Enabled. The wait time equal shutdown_wait_unfinished config. -Default value: 0. +Default value: `0`. ## shutdown_wait_unfinished The waiting time in seconds for currently handled connections when shutdown server. -Default Value: 5. +Default Value: `5`. + +## max_guaranteed_memory_usage + +Maximum guaranteed memory usage for processing of single query. +It represents soft limit in case when hard limit is reached on user level. +Zero means unlimited. +Read more about [memory overcommit](memory-overcommit.md). + +Default value: `0`. + +## memory_usage_overcommit_max_wait_microseconds + +Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. +If timeout is reached and memory is not freed, exception is thrown. +Read more about [memory overcommit](memory-overcommit.md). + +Default value: `0`. + +## max_guaranteed_memory_usage_for_user + +Maximum guaranteed memory usage for processing all concurrently running queries for the user. +It represents soft limit in case when hard limit is reached on global level. +Zero means unlimited. 
+Read more about [memory overcommit](memory-overcommit.md). + +Default value: `0`. From 81b2e0bfd9d7c667f53b10c2130b8a72978c7603 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 23 Mar 2022 23:15:08 +0100 Subject: [PATCH 025/111] Fix multiple installation, use a final path for gnu-debuglink --- cmake/strip_binary.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/strip_binary.cmake b/cmake/strip_binary.cmake index cbfd8a95c33..1547a814913 100644 --- a/cmake/strip_binary.cmake +++ b/cmake/strip_binary.cmake @@ -16,18 +16,18 @@ macro(clickhouse_strip_binary) endif() add_custom_command(TARGET ${STRIP_TARGET} POST_BUILD - COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/lib/debug/" + COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/lib/debug/bin" COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin" COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" - COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" - COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" + COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" + COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" - COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_BINDIR}/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMENT "Stripping clickhouse binary" VERBATIM ) install(PROGRAMS ${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - install(DIRECTORY ${STRIP_DESTINATION_DIR}/lib/debug DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT clickhouse) + install(FILES ${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug DESTINATION ${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) endmacro() From 6a8bb34b41fbf92ed09b146c1c857425102cbcf6 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 24 Mar 2022 17:17:25 +0800 Subject: [PATCH 026/111] update doc of hive --- docs/en/engines/table-engines/integrations/hive.md | 2 +- docs/zh/engines/table-engines/integrations/hive.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/hive.md b/docs/en/engines/table-engines/integrations/hive.md index b804b9c2279..61147467690 100644 --- a/docs/en/engines/table-engines/integrations/hive.md +++ b/docs/en/engines/table-engines/integrations/hive.md @@ -137,7 +137,7 @@ CREATE TABLE test.test_orc `f_array_array_float` Array(Array(Float32)), `day` String ) -ENGINE = Hive('thrift://202.168.117.26:9083', 'test', 'test_orc') +ENGINE = Hive('thrift://localhost:9083', 'test', 'test_orc') PARTITION BY day ``` diff --git a/docs/zh/engines/table-engines/integrations/hive.md b/docs/zh/engines/table-engines/integrations/hive.md index aa2c82d902a..24e0834d2fc 100644 --- a/docs/zh/engines/table-engines/integrations/hive.md +++ b/docs/zh/engines/table-engines/integrations/hive.md @@ -140,7 +140,7 @@ CREATE TABLE test.test_orc 
`f_array_array_float` Array(Array(Float32)), `day` String ) -ENGINE = Hive('thrift://202.168.117.26:9083', 'test', 'test_orc') +ENGINE = Hive('thrift://localhost:9083', 'test', 'test_orc') PARTITION BY day ``` From f061e54acbf9c0a9c61e33ff8415ce711f42193d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 25 Feb 2022 12:33:23 +0100 Subject: [PATCH 027/111] Adapt libcxx CMakefiles to LLVM 14 changes --- contrib/libcxx-cmake/CMakeLists.txt | 9 ++++++++- contrib/libcxxabi-cmake/CMakeLists.txt | 25 +++++++++++++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index 332fb0411cd..dc9df48b2c1 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -18,12 +18,14 @@ set(SRCS "${LIBCXX_SOURCE_DIR}/src/filesystem/directory_iterator.cpp" "${LIBCXX_SOURCE_DIR}/src/filesystem/int128_builtins.cpp" "${LIBCXX_SOURCE_DIR}/src/filesystem/operations.cpp" +"${LIBCXX_SOURCE_DIR}/src/format.cpp" "${LIBCXX_SOURCE_DIR}/src/functional.cpp" "${LIBCXX_SOURCE_DIR}/src/future.cpp" "${LIBCXX_SOURCE_DIR}/src/hash.cpp" "${LIBCXX_SOURCE_DIR}/src/ios.cpp" "${LIBCXX_SOURCE_DIR}/src/ios.instantiations.cpp" "${LIBCXX_SOURCE_DIR}/src/iostream.cpp" +"${LIBCXX_SOURCE_DIR}/src/legacy_pointer_safety.cpp" "${LIBCXX_SOURCE_DIR}/src/locale.cpp" "${LIBCXX_SOURCE_DIR}/src/memory.cpp" "${LIBCXX_SOURCE_DIR}/src/mutex.cpp" @@ -33,6 +35,9 @@ set(SRCS "${LIBCXX_SOURCE_DIR}/src/random.cpp" "${LIBCXX_SOURCE_DIR}/src/random_shuffle.cpp" "${LIBCXX_SOURCE_DIR}/src/regex.cpp" +"${LIBCXX_SOURCE_DIR}/src/ryu/d2fixed.cpp" +"${LIBCXX_SOURCE_DIR}/src/ryu/d2s.cpp" +"${LIBCXX_SOURCE_DIR}/src/ryu/f2s.cpp" "${LIBCXX_SOURCE_DIR}/src/shared_mutex.cpp" "${LIBCXX_SOURCE_DIR}/src/stdexcept.cpp" "${LIBCXX_SOURCE_DIR}/src/string.cpp" @@ -49,7 +54,9 @@ set(SRCS add_library(cxx ${SRCS}) set_target_properties(cxx PROPERTIES FOLDER "contrib/libcxx-cmake") -target_include_directories(cxx SYSTEM BEFORE PUBLIC $) +target_include_directories(cxx SYSTEM BEFORE PUBLIC + $ + $/src) target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI) # Enable capturing stack traces for all exceptions. 
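Note on the libc++ change above: the source list now tracks the LLVM 14 layout, which adds format.cpp, legacy_pointer_safety.cpp and the ryu/ translation units, and the second include directory appears to expose libc++'s own src/ tree, which the new translation units (and the libc++abi build below) rely on. The ryu sources are the backend for floating-point std::to_chars in libc++ 14. The snippet below is only an illustrative sketch of the facility those sources provide; it is not part of the patch and assumes a toolchain where libc++ 14's floating-point to_chars is available.

    #include <charconv>
    #include <cstdio>

    int main()
    {
        char buf[64];
        const double value = 0.1;
        // Shortest round-trippable representation; libc++ 14 implements this
        // on top of the ryu sources added in the CMake change above.
        auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), value);
        if (ec == std::errc())
            printf("%.*s\n", static_cast<int>(ptr - buf), buf);
        return 0;
    }
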
diff --git a/contrib/libcxxabi-cmake/CMakeLists.txt b/contrib/libcxxabi-cmake/CMakeLists.txt index 425111d9b26..bf1ede8a60e 100644 --- a/contrib/libcxxabi-cmake/CMakeLists.txt +++ b/contrib/libcxxabi-cmake/CMakeLists.txt @@ -1,24 +1,24 @@ set(LIBCXXABI_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libcxxabi") set(SRCS -"${LIBCXXABI_SOURCE_DIR}/src/stdlib_stdexcept.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_virtual.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_thread_atexit.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/fallback_malloc.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_guard.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_default_handlers.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_personality.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/stdlib_exception.cpp" "${LIBCXXABI_SOURCE_DIR}/src/abort_message.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_aux_runtime.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_default_handlers.cpp" "${LIBCXXABI_SOURCE_DIR}/src/cxa_demangle.cpp" "${LIBCXXABI_SOURCE_DIR}/src/cxa_exception.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_handlers.cpp" "${LIBCXXABI_SOURCE_DIR}/src/cxa_exception_storage.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/private_typeinfo.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/stdlib_typeinfo.cpp" -"${LIBCXXABI_SOURCE_DIR}/src/cxa_aux_runtime.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_guard.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_handlers.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_personality.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_thread_atexit.cpp" "${LIBCXXABI_SOURCE_DIR}/src/cxa_vector.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/cxa_virtual.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/fallback_malloc.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/private_typeinfo.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/stdlib_exception.cpp" "${LIBCXXABI_SOURCE_DIR}/src/stdlib_new_delete.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/stdlib_stdexcept.cpp" +"${LIBCXXABI_SOURCE_DIR}/src/stdlib_typeinfo.cpp" ) add_library(cxxabi ${SRCS}) @@ -30,6 +30,7 @@ target_compile_options(cxxabi PRIVATE -w) target_include_directories(cxxabi SYSTEM BEFORE PUBLIC $ PRIVATE $ + PRIVATE $ ) target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY) target_compile_options(cxxabi PRIVATE -nostdinc++ -fno-sanitize=undefined -Wno-macro-redefined) # If we don't disable UBSan, infinite recursion happens in dynamic_cast. 
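Note on the libc++abi change above: the source list is mostly re-sorted alphabetically; the functional part is the additional private include directory, presumably pointing at libc++ internals, matching the libc++ change earlier in this patch. One translation unit kept in that list, cxa_demangle.cpp, provides the Itanium demangler used when turning mangled symbol names back into readable ones. The sketch below only illustrates that entry point on Itanium-ABI toolchains; it is not introduced by this patch, and the mangled name is just a sample input.

    #include <cxxabi.h>
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        // "_Z3fooi" is the Itanium mangling of foo(int), used purely as example input.
        int status = 0;
        char * demangled = abi::__cxa_demangle("_Z3fooi", nullptr, nullptr, &status);
        if (status == 0 && demangled != nullptr)
        {
            printf("%s\n", demangled);   // prints "foo(int)"
            std::free(demangled);        // the buffer is malloc-allocated by the demangler
        }
        return 0;
    }
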
From 9466e581bb481dae22cd4b6b4111e991891665cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 25 Feb 2022 14:05:26 +0100 Subject: [PATCH 028/111] Temporarily use my forks for libcxx / libcxxabi --- .gitmodules | 4 ++-- contrib/libcxx | 2 +- contrib/libcxxabi | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6c9e66f9cbc..e2257630f97 100644 --- a/.gitmodules +++ b/.gitmodules @@ -70,10 +70,10 @@ url = https://github.com/ClickHouse/libgsasl.git [submodule "contrib/libcxx"] path = contrib/libcxx - url = https://github.com/ClickHouse/libcxx.git + url = https://github.com/Algunenano/libcxx.git [submodule "contrib/libcxxabi"] path = contrib/libcxxabi - url = https://github.com/ClickHouse/libcxxabi.git + url = https://github.com/Algunenano/libcxxabi.git [submodule "contrib/snappy"] path = contrib/snappy url = https://github.com/ClickHouse/snappy.git diff --git a/contrib/libcxx b/contrib/libcxx index 61e60294b1d..ae5c11b44e2 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit 61e60294b1de01483caa9f5d00f437c99b674de6 +Subproject commit ae5c11b44e2e66cdb0d0283bc0c1e821e27dcb85 diff --git a/contrib/libcxxabi b/contrib/libcxxabi index df8f1e727db..8106144ae52 160000 --- a/contrib/libcxxabi +++ b/contrib/libcxxabi @@ -1 +1 @@ -Subproject commit df8f1e727dbc9e2bedf2282096fa189dc3fe0076 +Subproject commit 8106144ae5246d21783b80c3a0887b195805b438 From 841bf613bab74426a9bbc8e32da039b9dc1320ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 1 Mar 2022 10:49:51 +0100 Subject: [PATCH 029/111] Apply latest LLVM bug fixes to filesystem remove_all --- contrib/libcxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/libcxx b/contrib/libcxx index ae5c11b44e2..3a4c2689ae8 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit ae5c11b44e2e66cdb0d0283bc0c1e821e27dcb85 +Subproject commit 3a4c2689ae846c16a3210744f8386f99d5e85d17 From 7fd4f060205fc6dbca6526520ac21220e369f304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 1 Mar 2022 14:05:57 +0100 Subject: [PATCH 030/111] Temp fork of hyperscan to fix C++20 issue --- .gitmodules | 2 +- contrib/hyperscan | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index e2257630f97..bb028d63019 100644 --- a/.gitmodules +++ b/.gitmodules @@ -88,7 +88,7 @@ url = https://github.com/ClickHouse/h3 [submodule "contrib/hyperscan"] path = contrib/hyperscan - url = https://github.com/ClickHouse/hyperscan.git + url = https://github.com/Algunenano/hyperscan.git [submodule "contrib/libunwind"] path = contrib/libunwind url = https://github.com/ClickHouse/libunwind.git diff --git a/contrib/hyperscan b/contrib/hyperscan index e9f08df0213..e2ac3060fd5 160000 --- a/contrib/hyperscan +++ b/contrib/hyperscan @@ -1 +1 @@ -Subproject commit e9f08df0213fc637aac0a5bbde9beeaeba2fe9fa +Subproject commit e2ac3060fd5136953980d7599d7194f6779a06f6 From 11a029c96b3f694170032dce915e70e876710087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 10 Mar 2022 18:30:46 +0100 Subject: [PATCH 031/111] Attempt to workaround clang-tidy bug --- .clang-tidy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.clang-tidy b/.clang-tidy index 0400b500e5c..ca84a4834e5 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -210,3 +210,6 @@ CheckOptions: value: false - key: performance-move-const-arg.CheckTriviallyCopyableMove value: false + # Workaround 
clang-tidy bug: https://github.com/llvm/llvm-project/issues/46097 + - key: readability-identifier-naming.TypeTemplateParameterIgnoredRegexp + value: expr-type From 9ad485af4cfdbf556c9fb8b08917fc8284da52e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 24 Mar 2022 10:39:55 +0100 Subject: [PATCH 032/111] Update LLVM to llvmorg-14.0.0 / 329fda39c507 --- contrib/libcxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/libcxx b/contrib/libcxx index 3a4c2689ae8..f5374ce9484 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit 3a4c2689ae846c16a3210744f8386f99d5e85d17 +Subproject commit f5374ce9484dc4accff6aa39f307f377cecad906 From 8a5bd2defa98fc7fd805dbb8b90c3ce70d08f877 Mon Sep 17 00:00:00 2001 From: helifu Date: Mon, 21 Mar 2022 11:21:27 +0800 Subject: [PATCH 033/111] Add explicit table info to the scan node of query plan and pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit :) explain plan select * from table1 t1 left join table2 t2 on t1.name = t2.name; ┌─explain──────────────────────────────────────────────────────────────────────────────────────┐ │ Expression ((Projection + Before ORDER BY)) │ │ Join (JOIN) │ │ Expression (Before JOIN) │ │ SettingQuotaAndLimits (Set limits and quota after reading from storage) │ │ ReadFromMergeTree (default.table1) │ │ Expression ((Joined actions + (Rename joined columns + (Projection + Before ORDER BY)))) │ │ SettingQuotaAndLimits (Set limits and quota after reading from storage) │ │ ReadFromMergeTree (default.table2) │ └──────────────────────────────────────────────────────────────────────────────────────────────┘ :) explain pipeline select * from table1 t1 left join table2 t2 on t1.name = t2.name; ┌─explain──────────────────────────────────────────┐ │ (Expression) │ │ ExpressionTransform × 24 │ │ (Join) │ │ JoiningTransform × 24 2 → 1 │ │ Resize 1 → 24 │ │ FillingRightJoinSide │ │ Resize 24 → 1 │ │ (Expression) │ │ ExpressionTransform × 24 │ │ (SettingQuotaAndLimits) │ │ (ReadFromMergeTree default.table1) │ │ MergeTreeThread × 24 0 → 1 │ │ (Expression) │ │ ExpressionTransform × 24 │ │ (SettingQuotaAndLimits) │ │ (ReadFromMergeTree default.table2) │ │ MergeTreeThread × 24 0 → 1 │ └──────────────────────────────────────────────────┘ --- src/Processors/QueryPlan/QueryPlan.cpp | 11 ++++++++++- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 5 +++++ src/Processors/QueryPlan/ReadFromMergeTree.h | 3 ++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index d948c16a78d..134f0e624c4 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -387,7 +388,15 @@ void QueryPlan::explainPlan(WriteBuffer & buffer, const ExplainPlanOptions & opt static void explainPipelineStep(IQueryPlanStep & step, IQueryPlanStep::FormatSettings & settings) { - settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << ")\n"; + // Add explicit description to the scan node of pipeline. 
+ if (ReadFromMergeTree::READ_FROM_MERGETREE_NAME == step.getName()) + { + settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << " " << step.getStepDescription() << ")\n"; + } + else + { + settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << ")\n"; + } size_t current_offset = settings.offset; step.describePipeline(settings); if (current_offset == settings.offset) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1bfc1ec7306..edee3eac1d2 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -45,6 +45,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +const std::string ReadFromMergeTree::READ_FROM_MERGETREE_NAME = "ReadFromMergeTree"; + static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); @@ -112,6 +114,9 @@ ReadFromMergeTree::ReadFromMergeTree( if (enable_parallel_reading) read_task_callback = context->getMergeTreeReadTaskCallback(); + + /// Add explicit description. + setStepDescription(data.getStorageID().getFullNameNotQuoted()); } Pipe ReadFromMergeTree::readFromPool( diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 685b99a7bdc..80a6fa57f32 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -100,7 +100,8 @@ public: bool enable_parallel_reading ); - String getName() const override { return "ReadFromMergeTree"; } + static const std::string READ_FROM_MERGETREE_NAME; + String getName() const override { return READ_FROM_MERGETREE_NAME; } void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; From b3c021ec97d3fa77252bf81fecab3b055fddd177 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Thu, 24 Mar 2022 13:53:06 +0300 Subject: [PATCH 034/111] Update test.py --- tests/integration/test_s3_zero_copy_replication/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index d7aa4feb1d2..22334b0803e 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -361,6 +361,7 @@ def test_s3_zero_copy_with_ttl_delete(cluster, large_data, iterations): ) node1.query("OPTIMIZE TABLE ttl_delete_test FINAL") + node1.query("SYSTEM SYNC REPLICA ttl_delete_test") node2.query("SYSTEM SYNC REPLICA ttl_delete_test") if large_data: From fbba450cffbcfb4903e0dd2a3a7cf2584a9f2452 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 11:23:18 +0000 Subject: [PATCH 035/111] Take ELSE branch into account for result type deduction --- src/Functions/caseWithExpression.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Functions/caseWithExpression.cpp b/src/Functions/caseWithExpression.cpp index 37ee89c1f11..e06a01431da 100644 --- a/src/Functions/caseWithExpression.cpp +++ b/src/Functions/caseWithExpression.cpp @@ -43,6 +43,9 @@ public: for (size_t i = 2; i < args.size() - 1; i += 2) dst_array_types.push_back(args[i]); + // Type of the ELSE branch + dst_array_types.push_back(args.back()); + return getLeastSupertype(dst_array_types); } From 7a25fc612d94b8e318747ef9a6e671620fd91799 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 11:28:38 
+0000 Subject: [PATCH 036/111] Add tests for caseWithExpression --- ...4_casewithexpression_return_type.reference | 20 +++++++++++++++++++ .../02244_casewithexpression_return_type.sql | 12 +++++++++++ 2 files changed, 32 insertions(+) create mode 100644 tests/queries/0_stateless/02244_casewithexpression_return_type.reference create mode 100644 tests/queries/0_stateless/02244_casewithexpression_return_type.sql diff --git a/tests/queries/0_stateless/02244_casewithexpression_return_type.reference b/tests/queries/0_stateless/02244_casewithexpression_return_type.reference new file mode 100644 index 00000000000..bcdeb4a290b --- /dev/null +++ b/tests/queries/0_stateless/02244_casewithexpression_return_type.reference @@ -0,0 +1,20 @@ +0 555555 +1 10 +2 555555 +3 55 +4 555555 +5 555555 +6 77 +7 555555 +8 555555 +9 95 +10 100 +11 555555 +12 555555 +13 555555 +14 555555 +15 555555 +16 555555 +17 555555 +18 555555 +19 555555 diff --git a/tests/queries/0_stateless/02244_casewithexpression_return_type.sql b/tests/queries/0_stateless/02244_casewithexpression_return_type.sql new file mode 100644 index 00000000000..02557a3ddfa --- /dev/null +++ b/tests/queries/0_stateless/02244_casewithexpression_return_type.sql @@ -0,0 +1,12 @@ + SELECT "number", CASE "number" + WHEN 3 THEN 55 + WHEN 6 THEN 77 + WHEN 9 THEN 95 + ELSE CASE + WHEN "number"=1 THEN 10 + WHEN "number"=10 THEN 100 + ELSE 555555 + END + END AS "LONG_COL_0" + FROM `system`.numbers + LIMIT 20; From d16ae465896182975704d96f6d3cdad282bb9d6a Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 24 Mar 2022 11:31:52 +0000 Subject: [PATCH 037/111] remove description for ReadFromMergeTree from pipeline, adjust tests for plan --- src/Processors/QueryPlan/QueryPlan.cpp | 11 ++--------- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 2 -- src/Processors/QueryPlan/ReadFromMergeTree.h | 4 ++-- ...timize_monotonous_functions_in_order_by.reference | 6 +++--- .../0_stateless/01576_alias_column_rewrite.reference | 12 ++++++------ .../0_stateless/01786_explain_merge_tree.reference | 9 +++++---- 6 files changed, 18 insertions(+), 26 deletions(-) diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index 134f0e624c4..b98cd82bbbf 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -388,15 +388,8 @@ void QueryPlan::explainPlan(WriteBuffer & buffer, const ExplainPlanOptions & opt static void explainPipelineStep(IQueryPlanStep & step, IQueryPlanStep::FormatSettings & settings) { - // Add explicit description to the scan node of pipeline. 
- if (ReadFromMergeTree::READ_FROM_MERGETREE_NAME == step.getName()) - { - settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << " " << step.getStepDescription() << ")\n"; - } - else - { - settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << ")\n"; - } + settings.out << String(settings.offset, settings.indent_char) << "(" << step.getName() << ")\n"; + size_t current_offset = settings.offset; step.describePipeline(settings); if (current_offset == settings.offset) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index edee3eac1d2..e1b099e44c3 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -45,8 +45,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -const std::string ReadFromMergeTree::READ_FROM_MERGETREE_NAME = "ReadFromMergeTree"; - static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 80a6fa57f32..6846506f260 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -100,8 +100,8 @@ public: bool enable_parallel_reading ); - static const std::string READ_FROM_MERGETREE_NAME; - String getName() const override { return READ_FROM_MERGETREE_NAME; } + static constexpr auto name = "ReadFromMergeTree"; + String getName() const override { return name; } void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; diff --git a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference index d8c9b88d8e8..46aaa6e07d6 100644 --- a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference +++ b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference @@ -9,7 +9,7 @@ Expression (Projection) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_order_by) SELECT timestamp, key @@ -21,7 +21,7 @@ Expression (Projection) Sorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_order_by) SELECT timestamp, key @@ -35,7 +35,7 @@ Expression (Projection) Sorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_order_by) SELECT timestamp, key diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.reference b/tests/queries/0_stateless/01576_alias_column_rewrite.reference index 07d361cfa46..11cc146dd62 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.reference +++ b/tests/queries/0_stateless/01576_alias_column_rewrite.reference @@ -26,35 +26,35 @@ Expression (Projection) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits 
and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) optimize_aggregation_in_order Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree + ReadFromMergeTree (default.test_table) second-index 1 1 diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.reference b/tests/queries/0_stateless/01786_explain_merge_tree.reference index 9b2df9773ea..25c7c37beca 100644 --- a/tests/queries/0_stateless/01786_explain_merge_tree.reference +++ b/tests/queries/0_stateless/01786_explain_merge_tree.reference @@ -1,4 +1,4 @@ - ReadFromMergeTree + ReadFromMergeTree (default.test_index) Indexes: MinMax Keys: @@ -32,6 +32,7 @@ Granules: 1/2 ----------------- "Node Type": "ReadFromMergeTree", + "Description": "default.test_index", "Indexes": [ { "Type": "MinMax", @@ -89,16 +90,16 @@ } ] ----------------- - ReadFromMergeTree + ReadFromMergeTree (default.test_index) ReadType: InOrder Parts: 1 Granules: 3 ----------------- - ReadFromMergeTree + ReadFromMergeTree (default.test_index) ReadType: InReverseOrder Parts: 1 Granules: 3 - ReadFromMergeTree + ReadFromMergeTree (default.idx) Indexes: PrimaryKey Keys: From f106e2dd492b80201067679ec89359744898a6b3 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 24 Mar 2022 11:53:58 +0000 Subject: [PATCH 038/111] fix style in QueryPlan.cpp --- src/Processors/QueryPlan/QueryPlan.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index b98cd82bbbf..b2305d9aab2 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -1,17 +1,22 @@ -#include -#include -#include -#include -#include -#include +#include + +#include + #include #include -#include + +#include +#include + +#include +#include #include #include -#include +#include #include -#include + +#include + namespace DB { From 37286c6141aba20e62d3852b5e2bf5edf1533f67 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 12:55:56 +0100 Subject: [PATCH 039/111] Increase fiber stack size a bit in attempt to fix stack overflow in tests with address sanitizer --- src/Common/FiberStack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index 29f84ee6d85..2cc301dcc62 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -32,7 +32,7 @@ public: /// /// Current value is just enough for all tests in our CI. It's not selected in some special /// way. We will have 40 pages with 4KB page size. 
- static constexpr size_t default_stack_size = 192 * 1024; /// 64KB was not enough for tests + static constexpr size_t default_stack_size = 256 * 1024; /// 64KB was not enough for tests explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) { From a4e8e940bc166744a8026d517d647527196def88 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 24 Mar 2022 13:45:04 +0100 Subject: [PATCH 040/111] Fixed tests --- tests/queries/0_stateless/02118_deserialize_whole_text.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02118_deserialize_whole_text.sh b/tests/queries/0_stateless/02118_deserialize_whole_text.sh index fe9256df329..e9f35582f15 100755 --- a/tests/queries/0_stateless/02118_deserialize_whole_text.sh +++ b/tests/queries/0_stateless/02118_deserialize_whole_text.sh @@ -41,16 +41,16 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'CSV', 'x IPv4')" 2>&1 | echo "[\"255.255.255.255trash\"]" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'JSONCompactEachRow', 'x IPv4')" 2>&1 | grep -F -q "UNEXPECTED_DATA_AFTER_PARSED_VALUE" && echo 'OK' || echo 'FAIL' -echo "[\"2a02:6ba8:2da1:40cd:31db:f9f1:fc3d:80b1trash\"]" > $DATA_FILE +echo "[\"0000:0000:0000:0000:0000:ffff:192.168.100.228b1trash\"]" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'JSONCompactStringsEachRow', 'x IPv6')" 2>&1 | grep -F -q "UNEXPECTED_DATA_AFTER_PARSED_VALUE" && echo 'OK' || echo 'FAIL' -echo "2a02:6ba8:2da1:40cd:31db:f9f1:fc3d:80b1trash" > $DATA_FILE +echo "0000:0000:0000:0000:0000:ffff:192.168.100.228b1trash" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'TSV', 'x IPv6')" 2>&1 | grep -F -q "UNEXPECTED_DATA_AFTER_PARSED_VALUE" && echo 'OK' || echo 'FAIL' -echo "2a02:6ba8:2da1:40cd:31db:f9f1:fc3d:80b1trash" > $DATA_FILE +echo "0000:0000:0000:0000:0000:ffff:192.168.100.228b1trash" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'CSV', 'x IPv6')" 2>&1 | grep -F -q "UNEXPECTED_DATA_AFTER_PARSED_VALUE" && echo 'OK' || echo 'FAIL' -echo "[\"2a02:6ba8:2da1:40cd:31db:f9f1:fc3d:80b1trash\"]" > $DATA_FILE +echo "[\"0000:0000:0000:0000:0000:ffff:192.168.100.228b1trash\"]" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('data_02118', 'JSONCompactEachRow', 'x IPv6')" 2>&1 | grep -F -q "UNEXPECTED_DATA_AFTER_PARSED_VALUE" && echo 'OK' || echo 'FAIL' echo "[\"{1:2, 2:3}trash\"]" > $DATA_FILE From 98be162d20e4b936146c298fa78d990bc64016c9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 13:49:26 +0100 Subject: [PATCH 041/111] Update comment --- src/Common/FiberStack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index 2cc301dcc62..d55b0fa691c 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -31,7 +31,7 @@ public: /// probably it worth to try to increase stack size for coroutines. /// /// Current value is just enough for all tests in our CI. It's not selected in some special - /// way. We will have 40 pages with 4KB page size. + /// way. We will have 64 pages with 4KB page size. 
static constexpr size_t default_stack_size = 256 * 1024; /// 64KB was not enough for tests explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) From 34b2307259d475d4d6acae01e65f5f158261bc6e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 13:56:00 +0000 Subject: [PATCH 042/111] Convert columns to full when multiple disjunct used --- src/Interpreters/HashJoin.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index e81db1427ef..f55ca0dac5a 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -284,6 +284,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s } JoinCommon::convertToFullColumnsInplace(right_table_keys); + JoinCommon::convertToFullColumnsInplace(sample_block_with_columns_to_add); initRightBlockStructure(data->sample_block); JoinCommon::createMissedColumns(sample_block_with_columns_to_add); From 3516eb05246a3b8cdc1967baffcfeae186349846 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 24 Mar 2022 15:37:53 +0100 Subject: [PATCH 043/111] Make GITHUB_RUN_URL variable and use it --- tests/ci/ast_fuzzer_check.py | 9 ++++----- tests/ci/build_report_check.py | 6 ++---- tests/ci/env_helper.py | 3 ++- tests/ci/finish_check.py | 4 ++-- tests/ci/performance_comparison_check.py | 9 +++++---- tests/ci/pr_info.py | 8 ++++---- tests/ci/run_check.py | 4 ++-- tests/ci/upload_result_helper.py | 4 ++-- 8 files changed, 23 insertions(+), 24 deletions(-) diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index c330d1c725b..94f5eff51d7 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -9,11 +9,10 @@ from github import Github from env_helper import ( GITHUB_REPOSITORY, - TEMP_PATH, - REPO_COPY, + GITHUB_RUN_URL, REPORTS_PATH, - GITHUB_SERVER_URL, - GITHUB_RUN_ID, + REPO_COPY, + TEMP_PATH, ) from s3_helper import S3Helper from get_robot_token import get_best_robot_token @@ -126,7 +125,7 @@ if __name__ == "__main__": logging.info("Exception uploading file %s text %s", f, ex) paths[f] = "" - report_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" + report_url = GITHUB_RUN_URL if paths["runlog.log"]: report_url = paths["runlog.log"] if paths["main.log"]: diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 1cee5fd42de..5afe2991073 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -11,7 +11,7 @@ from env_helper import ( TEMP_PATH, GITHUB_REPOSITORY, GITHUB_SERVER_URL, - GITHUB_RUN_ID, + GITHUB_RUN_URL, ) from report import create_build_html_report from s3_helper import S3Helper @@ -180,9 +180,7 @@ if __name__ == "__main__": branch_name = "PR #{}".format(pr_info.number) branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/pull/{pr_info.number}" commit_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commit/{pr_info.sha}" - task_url = ( - f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID or '0'}" - ) + task_url = GITHUB_RUN_URL report = create_build_html_report( build_check_name, build_results, diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 90178e5c56a..c34162ba51a 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -7,9 +7,10 @@ CACHES_PATH = os.getenv("CACHES_PATH", TEMP_PATH) CLOUDFLARE_TOKEN = os.getenv("CLOUDFLARE_TOKEN") GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH") GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY", 
"ClickHouse/ClickHouse") -GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID") +GITHUB_RUN_ID = os.getenv("GITHUB_RUN_ID", "0") GITHUB_SERVER_URL = os.getenv("GITHUB_SERVER_URL", "https://github.com") GITHUB_WORKSPACE = os.getenv("GITHUB_WORKSPACE", os.path.abspath("../../")) +GITHUB_RUN_URL = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" IMAGES_PATH = os.getenv("IMAGES_PATH") REPORTS_PATH = os.getenv("REPORTS_PATH", "./reports") REPO_COPY = os.getenv("REPO_COPY", os.path.abspath("../../")) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 79cea83b1c8..289e32406ef 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -2,7 +2,7 @@ import logging from github import Github -from env_helper import GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID +from env_helper import GITHUB_RUN_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token from commit_status_helper import get_commit @@ -33,7 +33,7 @@ if __name__ == "__main__": gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" + url = GITHUB_RUN_URL statuses = filter_statuses(list(commit.get_statuses())) if NAME in statuses and statuses[NAME].state == "pending": commit.create_status( diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index 761b1ac9257..c6ce86b2ce1 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -11,6 +11,7 @@ import re from github import Github +from env_helper import GITHUB_RUN_URL from pr_info import PRInfo from s3_helper import S3Helper from get_robot_token import get_best_robot_token @@ -88,9 +89,9 @@ if __name__ == "__main__": else: pr_link = f"https://github.com/ClickHouse/ClickHouse/pull/{pr_info.number}" - task_url = f"https://github.com/ClickHouse/ClickHouse/actions/runs/{os.getenv('GITHUB_RUN_ID')}" - docker_env += ' -e CHPC_ADD_REPORT_LINKS="Job (actions) Tested commit"'.format( - task_url, pr_link + docker_env += ( + f' -e CHPC_ADD_REPORT_LINKS="' + f'Job (actions) Tested commit"' ) if "RUN_BY_HASH_TOTAL" in os.environ: @@ -199,7 +200,7 @@ if __name__ == "__main__": status = "failure" message = "No message in report." 
- report_url = task_url + report_url = GITHUB_RUN_URL if paths["runlog.log"]: report_url = paths["runlog.log"] diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 64e22712059..ee4399792ae 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -8,7 +8,7 @@ from build_download_helper import get_with_retries from env_helper import ( GITHUB_REPOSITORY, GITHUB_SERVER_URL, - GITHUB_RUN_ID, + GITHUB_RUN_URL, GITHUB_EVENT_PATH, ) @@ -111,7 +111,7 @@ class PRInfo: self.sha = github_event["pull_request"]["head"]["sha"] repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" - self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" + self.task_url = GITHUB_RUN_URL self.repo_full_name = GITHUB_REPOSITORY self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" @@ -142,7 +142,7 @@ class PRInfo: self.sha = github_event["after"] pull_request = get_pr_for_commit(self.sha, github_event["ref"]) repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" - self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" + self.task_url = GITHUB_RUN_URL self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.repo_full_name = GITHUB_REPOSITORY if pull_request is None or pull_request["state"] == "closed": @@ -180,7 +180,7 @@ class PRInfo: self.number = 0 self.labels = {} repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}" - self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" + self.task_url = GITHUB_RUN_URL self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.repo_full_name = GITHUB_REPOSITORY self.pr_html_url = f"{repo_prefix}/commits/{ref}" diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 5b89082532d..9c7ba13f8e4 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -5,7 +5,7 @@ import re from typing import Tuple from github import Github -from env_helper import GITHUB_RUN_ID, GITHUB_REPOSITORY, GITHUB_SERVER_URL +from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token from commit_status_helper import get_commit @@ -231,7 +231,7 @@ if __name__ == "__main__": ) sys.exit(1) - url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" + url = GITHUB_RUN_URL if not can_run: print("::notice ::Cannot run") commit.create_status( diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index f7b74e8d5dd..289fc4b3184 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -2,7 +2,7 @@ import os import logging import ast -from env_helper import GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID +from env_helper import GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_URL from report import ReportColorTheme, create_test_html_report @@ -66,7 +66,7 @@ def upload_results( branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/pull/{pr_number}" commit_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commit/{commit_sha}" - task_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" + task_url = GITHUB_RUN_URL if additional_urls: raw_log_url = additional_urls[0] From f07918c6590792ec3137219ca641305fd9d53a0f Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 15:50:32 +0100 Subject: [PATCH 044/111] Increase stack size --- src/Common/FiberStack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index 
d55b0fa691c..c55608311d0 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -31,8 +31,8 @@ public: /// probably it worth to try to increase stack size for coroutines. /// /// Current value is just enough for all tests in our CI. It's not selected in some special - /// way. We will have 64 pages with 4KB page size. - static constexpr size_t default_stack_size = 256 * 1024; /// 64KB was not enough for tests + /// way. We will have 80 pages with 4KB page size. + static constexpr size_t default_stack_size = 320 * 1024; /// 64KB was not enough for tests explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) { From d6fc6b9c447d307cf359e35698bd90075a2f1bfc Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 24 Mar 2022 15:40:52 +0100 Subject: [PATCH 045/111] Add build-url label to built docker images --- tests/ci/docker_images_check.py | 3 ++- tests/ci/docker_test.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 818478f6430..3d0cc468aec 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -11,7 +11,7 @@ from typing import Dict, List, Optional, Set, Tuple, Union from github import Github -from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP +from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP, GITHUB_RUN_URL from s3_helper import S3Helper from pr_info import PRInfo from get_robot_token import get_best_robot_token, get_parameter_from_ssm @@ -234,6 +234,7 @@ def build_and_push_one_image( with open(build_log, "wb") as bl: cmd = ( "docker buildx build --builder default " + f"--label build-url={GITHUB_RUN_URL} " f"{from_tag_arg}" f"--build-arg BUILDKIT_INLINE_CACHE=1 " f"--tag {image.repo}:{version_string} " diff --git a/tests/ci/docker_test.py b/tests/ci/docker_test.py index 27bfe07db53..2b864b6b94c 100644 --- a/tests/ci/docker_test.py +++ b/tests/ci/docker_test.py @@ -4,6 +4,7 @@ import os import unittest from unittest.mock import patch +from env_helper import GITHUB_RUN_URL from pr_info import PRInfo import docker_images_check as di @@ -117,7 +118,8 @@ class TestDockerImageCheck(unittest.TestCase): mock_popen.assert_called_once() mock_machine.assert_not_called() self.assertIn( - "docker buildx build --builder default --build-arg FROM_TAG=version " + f"docker buildx build --builder default --label build-url={GITHUB_RUN_URL} " + "--build-arg FROM_TAG=version " "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version --cache-from " "type=registry,ref=name:version --push --progress plain path", mock_popen.call_args.args, @@ -133,7 +135,8 @@ class TestDockerImageCheck(unittest.TestCase): mock_popen.assert_called_once() mock_machine.assert_not_called() self.assertIn( - "docker buildx build --builder default --build-arg FROM_TAG=version2 " + f"docker buildx build --builder default --label build-url={GITHUB_RUN_URL} " + "--build-arg FROM_TAG=version2 " "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version2 --cache-from " "type=registry,ref=name:version2 --progress plain path", mock_popen.call_args.args, @@ -149,7 +152,7 @@ class TestDockerImageCheck(unittest.TestCase): mock_popen.assert_called_once() mock_machine.assert_not_called() self.assertIn( - "docker buildx build --builder default " + f"docker buildx build --builder default --label build-url={GITHUB_RUN_URL} " "--build-arg BUILDKIT_INLINE_CACHE=1 --tag name:version2 --cache-from " "type=registry,ref=name:version2 --progress plain path", 
mock_popen.call_args.args, From b2863d4cea139382730897bd11855e92f48cb1ae Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 24 Mar 2022 15:49:06 +0100 Subject: [PATCH 046/111] Rebuild docs-check for a test --- docker/docs/check/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/docs/check/Dockerfile b/docker/docs/check/Dockerfile index 174be123eed..4eb03a91e7a 100644 --- a/docker/docs/check/Dockerfile +++ b/docker/docs/check/Dockerfile @@ -1,4 +1,3 @@ -# rebuild in #33610 # docker build -t clickhouse/docs-check . ARG FROM_TAG=latest FROM clickhouse/docs-builder:$FROM_TAG From e2f4546f85d7cda7b0fcc93bf616a24dafb7bfc4 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 24 Mar 2022 16:59:09 +0100 Subject: [PATCH 047/111] Pin jinja2 to 3.0.3 to have working jinja2.contextfilter --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 8bf1a5f477c..c48a70b0909 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -10,7 +10,7 @@ cssmin==0.2.0 future==0.18.2 htmlmin==0.1.12 idna==2.10 -Jinja2>=3.0.3 +Jinja2==3.0.3 jinja2-highlight==0.6.1 jsmin==3.0.0 livereload==2.6.3 From a60e5bf427d0eb5744fd0a53496ee977d218da99 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 17:47:37 +0100 Subject: [PATCH 048/111] Update docs/en/operations/settings/settings.md Co-authored-by: alesapin --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 84454d1a01e..e9f195ddd51 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4227,7 +4227,7 @@ Default value: `0`. ## memory_usage_overcommit_max_wait_microseconds Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. -If timeout is reached and memory is not freed, exception is thrown. +If the timeout is reached and memory is not freed, an exception is thrown. Read more about [memory overcommit](memory-overcommit.md). Default value: `0`. 
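For context on how this wait is used in practice, the user-level timeout above is normally combined with the per-query guarantee that the overcommit ratio is computed from. A minimal sketch of a session opting into these limits follows; the setting names come from the docs touched in this patch series, while the concrete values and the sample query are purely illustrative:

    SET max_guaranteed_memory_usage = 10000000000;               -- guaranteed amount used to compute the overcommit ratio
    SET memory_usage_overcommit_max_wait_microseconds = 500000;  -- how long an allocation waits for memory to be freed
    SELECT groupArray(number) FROM numbers_mt(100000000) FORMAT Null;  -- deliberately memory-hungry example query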
From 56d20e00cc362d178ac5636e84e64a2a48717134 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 17:47:48 +0100 Subject: [PATCH 049/111] Update src/Common/OvercommitTracker.cpp Co-authored-by: alesapin --- src/Common/OvercommitTracker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index b0d60b1c25c..7b03b9f271d 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -23,7 +23,7 @@ void OvercommitTracker::setMaxWaitTime(UInt64 wait_time) bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker) { - // NOTE: DO NOT CHANGE THE ORDER OF LOCKS + // NOTE: Do not change the order of locks // // global_mutex must be acquired before overcommit_m, because // method OvercommitTracker::unsubscribe(MemoryTracker *) is From 581bcea1f066a99682e7ebedd76d910d6b5a6f41 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 17:47:59 +0100 Subject: [PATCH 050/111] Update docs/en/operations/settings/settings.md Co-authored-by: alesapin --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e9f195ddd51..4aee8b96600 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4226,7 +4226,7 @@ Default value: `0`. ## memory_usage_overcommit_max_wait_microseconds -Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. +Maximum time thread will wait for memory to be freed in the case of memory overcommit on a user level. If the timeout is reached and memory is not freed, an exception is thrown. Read more about [memory overcommit](memory-overcommit.md). From 9488c5b2fd0cc51ed984c265b9bec73beb8f73ba Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 17:48:11 +0100 Subject: [PATCH 051/111] Update docs/en/operations/settings/memory-overcommit.md Co-authored-by: alesapin --- docs/en/operations/settings/memory-overcommit.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md index e36678de3df..c0dfc094ece 100644 --- a/docs/en/operations/settings/memory-overcommit.md +++ b/docs/en/operations/settings/memory-overcommit.md @@ -2,7 +2,7 @@ Memory overcommit is an experimental technique intended to allow to set more flexible memory limits for queries. -The idea of this technique is to introduce settings which can represent guaranteed memory amount of memory a query can use. +The idea of this technique is to introduce settings which can represent guaranteed amount of memory a query can use. When memory overcommit is enabled and memory limit is reached ClickHouse will select the most overcommit query and try to free memory by killing this query. When memory limit is reached any query will wait some time during atempt to allocate new memory. From d90627e82fd9f01e9ec3e41a753d6ffd92967d51 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 24 Mar 2022 17:50:31 +0100 Subject: [PATCH 052/111] Fix version string update, fix #35518 --- tests/ci/version_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/version_helper.py b/tests/ci/version_helper.py index 02e22ee0c4d..3bb547333e7 100755 --- a/tests/ci/version_helper.py +++ b/tests/ci/version_helper.py @@ -238,7 +238,7 @@ def _update_dockerfile(repo_path: str, version: ClickHouseVersion): def update_version_local(repo_path, version, version_type="testing"): update_contributors() version.with_description(version_type) - update_cmake_version(version, version_type) + update_cmake_version(version) _update_changelog(repo_path, version) _update_dockerfile(repo_path, version) From 1b1a624df022b0c9edd64470f229b5510ed136b3 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 18:20:23 +0100 Subject: [PATCH 053/111] Update docs/en/operations/settings/memory-overcommit.md Co-authored-by: alesapin --- docs/en/operations/settings/memory-overcommit.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md index c0dfc094ece..bae81c0c238 100644 --- a/docs/en/operations/settings/memory-overcommit.md +++ b/docs/en/operations/settings/memory-overcommit.md @@ -8,7 +8,7 @@ When memory overcommit is enabled and memory limit is reached ClickHouse will se When memory limit is reached any query will wait some time during atempt to allocate new memory. If selected query is killed and memory is freed within waiting timeout, query will continue execution after waiting, otherwise it'll be killed too. -Selection of query to stop is performed by either global or user overcommit trackers depending on what memory limit is reached. +Selection of query to stop or kill is performed by either global or user overcommit trackers depending on what memory limit is reached. ## User overcommit tracker From 647068488013deb58c9dafddb72a22edf4fa64f6 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 24 Mar 2022 18:20:50 +0100 Subject: [PATCH 054/111] Update docs/en/operations/settings/memory-overcommit.md Co-authored-by: alesapin --- docs/en/operations/settings/memory-overcommit.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md index bae81c0c238..f7d2bd0e239 100644 --- a/docs/en/operations/settings/memory-overcommit.md +++ b/docs/en/operations/settings/memory-overcommit.md @@ -3,7 +3,7 @@ Memory overcommit is an experimental technique intended to allow to set more flexible memory limits for queries. The idea of this technique is to introduce settings which can represent guaranteed amount of memory a query can use. -When memory overcommit is enabled and memory limit is reached ClickHouse will select the most overcommit query and try to free memory by killing this query. +When memory overcommit is enabled and the memory limit is reached ClickHouse will select the most overcommitted query and try to free memory by killing this query. When memory limit is reached any query will wait some time during atempt to allocate new memory. If selected query is killed and memory is freed within waiting timeout, query will continue execution after waiting, otherwise it'll be killed too. 
From d910357e6958be848c897962d032e00997f7ab4b Mon Sep 17 00:00:00 2001 From: tavplubix Date: Thu, 24 Mar 2022 20:25:16 +0300 Subject: [PATCH 055/111] Update test.py --- tests/integration/test_s3_zero_copy_replication/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 22334b0803e..1ce1047ebec 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -361,6 +361,7 @@ def test_s3_zero_copy_with_ttl_delete(cluster, large_data, iterations): ) node1.query("OPTIMIZE TABLE ttl_delete_test FINAL") + node1.query("SYSTEM SYNC REPLICA ttl_delete_test") node2.query("SYSTEM SYNC REPLICA ttl_delete_test") From 78100abc5fc741e2a2e0b772d1dc0c6062f6903b Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 24 Mar 2022 17:51:18 +0000 Subject: [PATCH 056/111] add parallel parsing and schema inference for type Object --- src/Formats/registerFormats.cpp | 4 ++++ .../Impl/JSONAsStringRowInputFormat.cpp | 19 ++++++++++++++++--- .../Formats/Impl/JSONAsStringRowInputFormat.h | 10 ++++++++++ .../0_stateless/01825_type_json_btc.sh | 8 +++++++- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 210ef1953b1..8c5955b2108 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -13,6 +13,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory); void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory); void registerFileSegmentationEngineRegexp(FormatFactory & factory); void registerFileSegmentationEngineJSONAsString(FormatFactory & factory); +void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory); void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory); /// Formats for both input/output. 
@@ -103,6 +104,7 @@ void registerProtobufSchemaReader(FormatFactory & factory); void registerProtobufListSchemaReader(FormatFactory & factory); void registerLineAsStringSchemaReader(FormatFactory & factory); void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsObjectSchemaReader(FormatFactory & factory); void registerRawBLOBSchemaReader(FormatFactory & factory); void registerMsgPackSchemaReader(FormatFactory & factory); void registerCapnProtoSchemaReader(FormatFactory & factory); @@ -123,6 +125,7 @@ void registerFormats() registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineRegexp(factory); registerFileSegmentationEngineJSONAsString(factory); + registerFileSegmentationEngineJSONAsObject(factory); registerFileSegmentationEngineJSONCompactEachRow(factory); registerInputFormatNative(factory); @@ -207,6 +210,7 @@ void registerFormats() registerProtobufListSchemaReader(factory); registerLineAsStringSchemaReader(factory); registerJSONAsStringSchemaReader(factory); + registerJSONAsObjectSchemaReader(factory); registerRawBLOBSchemaReader(factory); registerMsgPackSchemaReader(factory); registerCapnProtoSchemaReader(factory); diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 914ec27fc46..9bf1682b77e 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -228,6 +228,14 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + { + return std::make_shared(); + }); +} + void registerInputFormatJSONAsObject(FormatFactory & factory) { factory.registerInputFormat("JSONAsObject", []( @@ -245,11 +253,16 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsObject", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } -void registerJSONAsStringSchemaReader(FormatFactory & factory) +void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory) { - factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + factory.registerFileSegmentationEngine("JSONAsObject", &fileSegmentationEngineJSONEachRow); +} + +void registerJSONAsObjectSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsObject", [](const FormatSettings &) { - return std::make_shared(); + return std::make_shared(); }); } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index f7880eac867..438107e73e6 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -73,4 +74,13 @@ public: } }; +class JSONAsObjectExternalSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"json", std::make_shared("json", false)}}; + } +}; + } diff --git a/tests/queries/0_stateless/01825_type_json_btc.sh b/tests/queries/0_stateless/01825_type_json_btc.sh index bfa209b72d1..7647328ca60 100755 --- 
a/tests/queries/0_stateless/01825_type_json_btc.sh +++ b/tests/queries/0_stateless/01825_type_json_btc.sh @@ -5,11 +5,15 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh +user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir -p ${user_files_path}/ +cp $CUR_DIR/data_json/btc_transactions.json ${user_files_path}/ + ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS btc" ${CLICKHOUSE_CLIENT} -q "CREATE TABLE btc (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 -cat $CUR_DIR/data_json/btc_transactions.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO btc FORMAT JSONAsObject" +${CLICKHOUSE_CLIENT} -q "INSERT INTO btc SELECT * FROM file('btc_transactions.json', 'JSONAsObject')" ${CLICKHOUSE_CLIENT} -q "SELECT count() FROM btc WHERE NOT ignore(*)" ${CLICKHOUSE_CLIENT} -q "DESC btc SETTINGS describe_extend_object_types = 1" @@ -21,3 +25,5 @@ ${CLICKHOUSE_CLIENT} -q "SELECT avg(length(data.inputs.prev_out.spending_outpoin ${CLICKHOUSE_CLIENT} -q "SELECT data.out.spending_outpoints AS outpoints FROM btc WHERE arrayExists(x -> notEmpty(x), outpoints)" ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS btc" + +rm ${user_files_path}/btc_transactions.json From d9d9f3bc6dfea4f8a357f84de2b5f10f0b170dd7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 24 Mar 2022 19:09:13 +0100 Subject: [PATCH 057/111] Fix --- .../ExternalDataSourceConfiguration.cpp | 5 + tests/config/config.d/named_collection.xml | 8 + .../02244_url_engine_headers_test.python | 141 ++++++++++++++++++ .../02244_url_engine_headers_test.reference | 1 + .../02244_url_engine_headers_test.sh | 7 + 5 files changed, 162 insertions(+) create mode 100644 tests/queries/0_stateless/02244_url_engine_headers_test.python create mode 100644 tests/queries/0_stateless/02244_url_engine_headers_test.reference create mode 100755 tests/queries/0_stateless/02244_url_engine_headers_test.sh diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index 5549a816a06..abd20e6e5fd 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -325,6 +325,7 @@ void URLBasedDataSourceConfiguration::set(const URLBasedDataSourceConfiguration compression_method = conf.compression_method; structure = conf.structure; http_method = conf.http_method; + headers = conf.headers; } @@ -364,6 +365,10 @@ std::optional getURLBasedDataSourceConfiguration(const { configuration.structure = config.getString(config_prefix + ".structure", ""); } + else if (key == "compression_method") + { + configuration.compression_method = config.getString(config_prefix + ".compression_method", ""); + } else if (key == "headers") { Poco::Util::AbstractConfiguration::Keys header_keys; diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index f3b7074e1ce..7d60533525e 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -14,5 +14,13 @@ default s
+        <url_with_headers>
+            <headers>
+                <header>
+                    <name>test_header</name>
+                    <value>test_header_clickhouse</value>
+                </header>
+            </headers>
+        </url_with_headers>
diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.python b/tests/queries/0_stateless/02244_url_engine_headers_test.python new file mode 100644 index 00000000000..f9c594233a0 --- /dev/null +++ b/tests/queries/0_stateless/02244_url_engine_headers_test.python @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +from http.server import SimpleHTTPRequestHandler,HTTPServer +import socket +import sys +import threading +import os +import traceback +import urllib.request +import subprocess + + +def is_ipv6(host): + try: + socket.inet_aton(host) + return False + except: + return True + +def get_local_port(host, ipv6): + if ipv6: + family = socket.AF_INET6 + else: + family = socket.AF_INET + + with socket.socket(family) as fd: + fd.bind((host, 0)) + return fd.getsockname()[1] + +CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost') +CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') + +# Server returns this JSON response. +SERVER_JSON_RESPONSE = \ +'''{ + "login": "ClickHouse", + "id": 54801242, + "name": "ClickHouse", + "company": null +}''' + +EXPECTED_ANSWER = \ +'''{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}''' + +##################################################################################### +# This test starts an HTTP server and serves data to clickhouse url-engine based table. +# The objective of this test is to check the ClickHouse server provides a User-Agent +# with HTTP requests. +# In order for it to work ip+port of http server (given below) should be +# accessible from clickhouse server. +##################################################################################### + +# IP-address of this host accessible from the outside world. Get the first one +HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0] +IS_IPV6 = is_ipv6(HTTP_SERVER_HOST) +HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6) + +# IP address and port of the HTTP server started from this script. 
+HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT) +if IS_IPV6: + HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/" +else: + HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/" + + +def get_ch_answer(query): + host = CLICKHOUSE_HOST + if IS_IPV6: + host = f'[{host}]' + + url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) + return urllib.request.urlopen(url, data=query.encode()).read().decode() + +def check_answers(query, answer): + ch_answer = get_ch_answer(query) + if ch_answer.strip() != answer.strip(): + print("FAIL on query:", query, file=sys.stderr) + print("Expected answer:", answer, file=sys.stderr) + print("Fetched answer :", ch_answer, file=sys.stderr) + raise Exception("Fail on query") + +class HttpProcessor(SimpleHTTPRequestHandler): + def _set_headers(self): + test_header = self.headers.get('test_header') + if test_header and test_header.startswith('test_header_clickhouse'): + self.send_response(200) + else: + self.send_response(403) + + self.send_header('Content-Type', 'text/csv') + self.end_headers() + + def do_GET(self): + self._set_headers() + self.wfile.write(SERVER_JSON_RESPONSE.encode()) + + def log_message(self, format, *args): + return + +class HTTPServerV6(HTTPServer): + address_family = socket.AF_INET6 + +def start_server(requests_amount): + if IS_IPV6: + httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor) + else: + httpd = HTTPServer(HTTP_SERVER_ADDRESS, HttpProcessor) + + def real_func(): + for i in range(requests_amount): + httpd.handle_request() + + t = threading.Thread(target=real_func) + return t + +##################################################################### +# Testing area. +##################################################################### + +def test_select(): + global HTTP_SERVER_URL_STR + query = 'SELECT * FROM url(url_with_headers, url=\'{}\', format=\'JSONAsString\');'.format(HTTP_SERVER_URL_STR) + check_answers(query, EXPECTED_ANSWER) + +def main(): + t = start_server(2) + t.start() + test_select() + t.join() + print("PASSED") + +if __name__ == "__main__": + try: + main() + except Exception as ex: + exc_type, exc_value, exc_traceback = sys.exc_info() + traceback.print_tb(exc_traceback, file=sys.stderr) + print(ex, file=sys.stderr) + sys.stderr.flush() + + os._exit(1) diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.reference b/tests/queries/0_stateless/02244_url_engine_headers_test.reference new file mode 100644 index 00000000000..53cdf1e9393 --- /dev/null +++ b/tests/queries/0_stateless/02244_url_engine_headers_test.reference @@ -0,0 +1 @@ +PASSED diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.sh b/tests/queries/0_stateless/02244_url_engine_headers_test.sh new file mode 100755 index 00000000000..37d89907d79 --- /dev/null +++ b/tests/queries/0_stateless/02244_url_engine_headers_test.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +python3 "$CURDIR"/02244_url_engine_headers_test.python From 1c2afcdee177193c6973d4e18c3ba1e071901576 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 24 Mar 2022 18:22:41 +0000 Subject: [PATCH 058/111] better test --- tests/queries/0_stateless/01825_type_json_btc.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01825_type_json_btc.sh b/tests/queries/0_stateless/01825_type_json_btc.sh index 7647328ca60..f11b952ae3b 100755 --- a/tests/queries/0_stateless/01825_type_json_btc.sh +++ b/tests/queries/0_stateless/01825_type_json_btc.sh @@ -6,14 +6,15 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CUR_DIR"/../shell_config.sh user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -mkdir -p ${user_files_path}/ -cp $CUR_DIR/data_json/btc_transactions.json ${user_files_path}/ +mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* +cp $CUR_DIR/data_json/btc_transactions.json ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS btc" ${CLICKHOUSE_CLIENT} -q "CREATE TABLE btc (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 -${CLICKHOUSE_CLIENT} -q "INSERT INTO btc SELECT * FROM file('btc_transactions.json', 'JSONAsObject')" +${CLICKHOUSE_CLIENT} -q "INSERT INTO btc SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/btc_transactions.json', 'JSONAsObject')" ${CLICKHOUSE_CLIENT} -q "SELECT count() FROM btc WHERE NOT ignore(*)" ${CLICKHOUSE_CLIENT} -q "DESC btc SETTINGS describe_extend_object_types = 1" @@ -26,4 +27,4 @@ ${CLICKHOUSE_CLIENT} -q "SELECT data.out.spending_outpoints AS outpoints FROM bt ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS btc" -rm ${user_files_path}/btc_transactions.json +rm ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/btc_transactions.json From 7c0bdbfa930c82b3f23f4207c040f99dc25106d6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 24 Mar 2022 20:24:05 +0100 Subject: [PATCH 059/111] fix stupid bug --- src/Interpreters/InterpreterCreateQuery.cpp | 7 +++---- tests/queries/0_stateless/01107_atomic_db_detach_attach.sh | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f7dbd1c8b65..d8923b3cc42 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1180,11 +1180,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, /// old instance of the storage. For example, AsynchronousMetrics may cause ATTACH to fail, /// so we allow waiting here. If database_atomic_wait_for_drop_and_detach_synchronously is disabled /// and old storage instance still exists it will throw exception. 
- bool throw_if_table_in_use = getContext()->getSettingsRef().database_atomic_wait_for_drop_and_detach_synchronously; - if (throw_if_table_in_use) - database->checkDetachedTableNotInUse(create.uuid); - else + if (getContext()->getSettingsRef().database_atomic_wait_for_drop_and_detach_synchronously) database->waitDetachedTableNotInUse(create.uuid); + else + database->checkDetachedTableNotInUse(create.uuid); } StoragePtr res; diff --git a/tests/queries/0_stateless/01107_atomic_db_detach_attach.sh b/tests/queries/0_stateless/01107_atomic_db_detach_attach.sh index 300d32f5a0c..60650cb9cc3 100755 --- a/tests/queries/0_stateless/01107_atomic_db_detach_attach.sh +++ b/tests/queries/0_stateless/01107_atomic_db_detach_attach.sh @@ -13,7 +13,7 @@ $CLICKHOUSE_CLIENT -q "INSERT INTO test_01107.mt SELECT number + sleepEachRow(3) sleep 1 $CLICKHOUSE_CLIENT -q "DETACH TABLE test_01107.mt" --database_atomic_wait_for_drop_and_detach_synchronously=0 -$CLICKHOUSE_CLIENT -q "ATTACH TABLE test_01107.mt" 2>&1 | grep -F "Code: 57" > /dev/null && echo "OK" +$CLICKHOUSE_CLIENT -q "ATTACH TABLE test_01107.mt" --database_atomic_wait_for_drop_and_detach_synchronously=0 2>&1 | grep -F "Code: 57" > /dev/null && echo "OK" $CLICKHOUSE_CLIENT -q "DETACH DATABASE test_01107" --database_atomic_wait_for_drop_and_detach_synchronously=0 2>&1 | grep -F "Code: 219" > /dev/null && echo "OK" wait From c63bc052929cc133bccf3f0747a1ae9cbc8b8878 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 24 Mar 2022 19:32:14 +0000 Subject: [PATCH 060/111] Add test --- .../0_stateless/01675_data_type_coroutine.reference | 1 + .../queries/0_stateless/01675_data_type_coroutine.sh | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/tests/queries/0_stateless/01675_data_type_coroutine.reference b/tests/queries/0_stateless/01675_data_type_coroutine.reference index 7326d960397..541dab48def 100644 --- a/tests/queries/0_stateless/01675_data_type_coroutine.reference +++ b/tests/queries/0_stateless/01675_data_type_coroutine.reference @@ -1 +1,2 @@ Ok +Ok diff --git a/tests/queries/0_stateless/01675_data_type_coroutine.sh b/tests/queries/0_stateless/01675_data_type_coroutine.sh index 8e80d722a4c..98a3d351cd2 100755 --- a/tests/queries/0_stateless/01675_data_type_coroutine.sh +++ b/tests/queries/0_stateless/01675_data_type_coroutine.sh @@ -6,6 +6,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) counter=0 retries=60 + I=0 while [[ $counter -lt $retries ]]; do I=$((I + 1)) @@ -14,5 +15,16 @@ while [[ $counter -lt $retries ]]; do ((++counter)) done +echo 'Ok' + +counter=0 +I=0 +while [[ $counter -lt $retries ]]; do + I=$((I + 1)) + TYPE=$(perl -e "print 'Array(' x $I; print 'UInt8'; print ')' x $I") + ${CLICKHOUSE_CLIENT} --prefer_localhost_replica=0 --max_parser_depth 1000000 --query "SELECT * FROM remote('127.0.0.{1,2}', generateRandom('x $TYPE', 1, 1, 1)) LIMIT 1 FORMAT Null" 2>&1 | grep -q -F 'Maximum parse depth' && break; + ((++counter)) +done + #echo "I = ${I}" echo 'Ok' From a2c4073ca1d658f87c271876b3007975896b6901 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 24 Mar 2022 22:48:24 +0300 Subject: [PATCH 061/111] [docs] remove Metrika counter --- website/js/base.js | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/website/js/base.js b/website/js/base.js index 6704231c69d..a2356cb565c 100644 --- a/website/js/base.js +++ b/website/js/base.js @@ -70,15 +70,6 @@ (function (d, w, c) { (w[c] = w[c] || []).push(function() { var is_single_page = $('html').attr('data-single-page') === 'true'; - try { - 
w.yaCounter18343495 = new Ya.Metrika2({ - id: 18343495, - clickmap: !is_single_page, - trackLinks: !is_single_page, - accurateTrackBounce: !is_single_page, - webvisor: !is_single_page - }); - } catch(e) { } if (!is_single_page) { $('head').each(function(_, element) { @@ -91,20 +82,6 @@ }); } }); - - var n = d.getElementsByTagName("script")[0], - s = d.createElement("script"), - f = function () { n.parentNode.insertBefore(s, n); }; - s.type = "text/javascript"; - s.async = true; - s.src = "/js/metrika.js"; - if (window.location.hostname.endsWith('clickhouse.com')) { - if (w.opera == "[object Opera]") { - d.addEventListener("DOMContentLoaded", f, false); - } else { - f(); - } - } })(document, window, "yandex_metrika_callbacks2"); var beforePrint = function() { From 123ea5117fb2c169c017e6fccd3f3ffd1d86a75a Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 24 Mar 2022 22:55:26 +0300 Subject: [PATCH 062/111] Update base.js --- website/js/base.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/js/base.js b/website/js/base.js index a2356cb565c..9389028f1ef 100644 --- a/website/js/base.js +++ b/website/js/base.js @@ -82,7 +82,7 @@ }); } }); - })(document, window, "yandex_metrika_callbacks2"); + })(document, window, ""); var beforePrint = function() { var details = document.getElementsByTagName("details"); From 780cc37479dbff5196d8fd0733302629d60c6df8 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 24 Mar 2022 20:29:12 +0000 Subject: [PATCH 063/111] Make some tests more stable --- tests/queries/0_stateless/01059_storage_file_compression.sh | 4 ++-- tests/queries/0_stateless/01529_bad_memory_tracking.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01059_storage_file_compression.sh b/tests/queries/0_stateless/01059_storage_file_compression.sh index ab56caee3fe..9d928986638 100755 --- a/tests/queries/0_stateless/01059_storage_file_compression.sh +++ b/tests/queries/0_stateless/01059_storage_file_compression.sh @@ -16,10 +16,10 @@ do ${CLICKHOUSE_CLIENT} --query "DROP TABLE file" done -${CLICKHOUSE_CLIENT} --query "SELECT count(), max(x) FROM file('${CLICKHOUSE_DATABASE}/{gz,br,xz,zst,lz4,bz2}.tsv.{gz,br,xz,zst,lz4,bz2}', TSV, 'x UInt64')" +${CLICKHOUSE_CLIENT} --max_read_buffer_size=1048576 --query "SELECT count(), max(x) FROM file('${CLICKHOUSE_DATABASE}/{gz,br,xz,zst,lz4,bz2}.tsv.{gz,br,xz,zst,lz4,bz2}', TSV, 'x UInt64')" for m in gz br xz zst lz4 bz2 do - ${CLICKHOUSE_CLIENT} --query "SELECT count() < 4000000, max(x) FROM file('${CLICKHOUSE_DATABASE}/${m}.tsv.${m}', RowBinary, 'x UInt8', 'none')" + ${CLICKHOUSE_CLIENT} --max_read_buffer_size=1048576 --query "SELECT count() < 4000000, max(x) FROM file('${CLICKHOUSE_DATABASE}/${m}.tsv.${m}', RowBinary, 'x UInt8', 'none')" done diff --git a/tests/queries/0_stateless/01529_bad_memory_tracking.sh b/tests/queries/0_stateless/01529_bad_memory_tracking.sh index f5d096cc799..d12623d04b9 100755 --- a/tests/queries/0_stateless/01529_bad_memory_tracking.sh +++ b/tests/queries/0_stateless/01529_bad_memory_tracking.sh @@ -8,6 +8,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CURDIR"/../shell_config.sh for _ in {1..10}; do - ${CLICKHOUSE_CLIENT} --max_memory_usage '10G' --query "SELECT i FROM generateRandom('i Array(Int8)', 1, 1, 1048577) LIMIT 65536" |& grep -v -e 'Received exception from server' -e 'Code: 241' -e '(query: ' + ${CLICKHOUSE_CLIENT} --max_block_size=65505 --max_memory_usage '10G' --query "SELECT i FROM generateRandom('i Array(Int8)', 1, 1, 1048577) LIMIT 65536" |& grep -v -e 'Received exception from server' -e 'Code: 241' -e '(query: ' done exit 0 From aedea58741f710c9cf26ff5d6dca75952412b8c8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 24 Mar 2022 22:10:20 +0100 Subject: [PATCH 064/111] Mark test as long --- tests/queries/0_stateless/01675_data_type_coroutine.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01675_data_type_coroutine.sh b/tests/queries/0_stateless/01675_data_type_coroutine.sh index 98a3d351cd2..9f7d5401bd2 100755 --- a/tests/queries/0_stateless/01675_data_type_coroutine.sh +++ b/tests/queries/0_stateless/01675_data_type_coroutine.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 52f20041c1c33cd5744f5ce4a0bd2f1e0da8b647 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 24 Mar 2022 21:12:26 +0000 Subject: [PATCH 065/111] Fix test 01091_num_threads --- tests/queries/0_stateless/01091_num_threads.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01091_num_threads.sql b/tests/queries/0_stateless/01091_num_threads.sql index e32d663880f..faeceb0e6d6 100644 --- a/tests/queries/0_stateless/01091_num_threads.sql +++ b/tests/queries/0_stateless/01091_num_threads.sql @@ -1,5 +1,6 @@ set log_queries=1; set log_query_threads=1; +set max_threads=0; WITH 01091 AS id SELECT 1; SYSTEM FLUSH LOGS; From f133531bc0a886c4b3b43374d906127cbba29b0f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 24 Mar 2022 22:19:30 +0100 Subject: [PATCH 066/111] Functions cast into IPv4, IPV6 add backward incompatible section into changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61724ab2d0c..100b03ab92b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * Make `arrayCompact` function behave as other higher-order functions: perform compaction not of lambda function results but on the original array. If you're using nontrivial lambda functions in arrayCompact you may restore old behaviour by wrapping `arrayCompact` arguments into `arrayMap`. Closes [#34010](https://github.com/ClickHouse/ClickHouse/issues/34010) [#18535](https://github.com/ClickHouse/ClickHouse/issues/18535) [#14778](https://github.com/ClickHouse/ClickHouse/issues/14778). [#34795](https://github.com/ClickHouse/ClickHouse/pull/34795) ([Alexandre Snarskii](https://github.com/snar)). * Change implementation specific behavior on overflow of function `toDatetime`. It will be saturated to the nearest min/max supported instant of datetime instead of wraparound. This change is highlighted as "backward incompatible" because someone may unintentionally rely on the old behavior. [#32898](https://github.com/ClickHouse/ClickHouse/pull/32898) ([HaiBo Li](https://github.com/marising)). +* Make function `cast(value, 'IPv4')`, `cast(value, 'IPv6')` behave same as `toIPv4`, `toIPv6` functions. 
Changed behavior of incorrect IP address passed into functions `toIPv4`,` toIPv6`, now if invalid IP address passes into this functions exception will be raised, before this function return default value. Added functions `IPv4StringToNumOrDefault`, `IPv4StringToNumOrNull`, `IPv6StringToNumOrDefault`, `IPv6StringOrNull` `toIPv4OrDefault`, `toIPv4OrNull`, `toIPv6OrDefault`, `toIPv6OrNull`. Functions `IPv4StringToNumOrDefault `, `toIPv4OrDefault `, `toIPv6OrDefault ` should be used if previous logic relied on `IPv4StringToNum`, `toIPv4`, `toIPv6` returning default value for invalid address. Added setting `cast_ipv4_ipv6_default_on_conversion_error`, if this setting enabled, then IP address conversion functions will behave as before. Closes [#22825](https://github.com/ClickHouse/ClickHouse/issues/22825). Closes [#5799](https://github.com/ClickHouse/ClickHouse/issues/5799). Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#35240](https://github.com/ClickHouse/ClickHouse/pull/35240) ([Maksim Kita](https://github.com/kitaisreal)). #### New Feature @@ -366,7 +367,7 @@ #### Improvement -* Now date time conversion functions that generates time before `1970-01-01 00:00:00` will be saturated to zero instead of overflow. [#29953](https://github.com/ClickHouse/ClickHouse/pull/29953) ([Amos Bird](https://github.com/amosbird)). It also fixes a bug in index analysis if date truncation function would yield result before the Unix epoch. +* Now date time conversion functions that generates time before `1970-01-01 00:00:00` will be saturated to zero instead of overflow. [#29953](https://github.com/ClickHouse/ClickHouse/pull/29953) ([Amos Bird](https://github.com/amosbird)). It also fixes a bug in index analysis if date truncation function would yield result before the Unix epoch. * Always display resource usage (total CPU usage, total RAM usage and max RAM usage per host) in client. [#33271](https://github.com/ClickHouse/ClickHouse/pull/33271) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Improve `Bool` type serialization and deserialization, check the range of values. [#32984](https://github.com/ClickHouse/ClickHouse/pull/32984) ([Kruglov Pavel](https://github.com/Avogar)). * If an invalid setting is defined using the `SET` query or using the query parameters in the HTTP request, error message will contain suggestions that are similar to the invalid setting string (if any exists). [#32946](https://github.com/ClickHouse/ClickHouse/pull/32946) ([Antonio Andelic](https://github.com/antonio2368)). From 572b15a63c3abc5096f80801cf92c64c1d235118 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 24 Mar 2022 14:32:43 +0100 Subject: [PATCH 067/111] Fix paths and some cmake typos --- CMakeLists.txt | 2 +- cmake/strip_binary.cmake | 6 +++--- packages/clickhouse-common-static-dbg.yaml | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ed3872fd6e..deef582c790 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -266,7 +266,7 @@ if (OBJCOPY_PATH AND YANDEX_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE)) endif () # Allows to build stripped binary in a separate directory -if (OBJCOPY_PATH AND READELF_PATH) +if (OBJCOPY_PATH AND STRIP_PATH) option(INSTALL_STRIPPED_BINARIES "Build stripped binaries with debug info in separate directory" OFF) if (INSTALL_STRIPPED_BINARIES) set(STRIPPED_BINARIES_OUTPUT "stripped" CACHE STRING "A separate directory for stripped information") diff --git a/cmake/strip_binary.cmake b/cmake/strip_binary.cmake index 1547a814913..1f24790a159 100644 --- a/cmake/strip_binary.cmake +++ b/cmake/strip_binary.cmake @@ -22,12 +22,12 @@ macro(clickhouse_strip_binary) COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" - COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_BINDIR}/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" + COMMAND "${OBJCOPY_PATH}" --add-gnu-debuglink "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" COMMENT "Stripping clickhouse binary" VERBATIM ) install(PROGRAMS ${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) - install(FILES ${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug DESTINATION ${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) + install(FILES ${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug DESTINATION ${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_FULL_BINDIR}/${STRIP_TARGET}.debug COMPONENT clickhouse) endmacro() @@ -49,5 +49,5 @@ macro(clickhouse_make_empty_debug_info_for_nfpm) COMMENT "Addiding empty debug info for NFPM" VERBATIM ) - install(FILES "${EMPTY_DEBUG_DESTINATION_DIR}/lib/debug/${EMPTY_DEBUG_TARGET}.debug" DESTINATION "${CMAKE_INSTALL_LIBDIR}/debug" COMPONENT clickhouse) + install(FILES "${EMPTY_DEBUG_DESTINATION_DIR}/lib/debug/${EMPTY_DEBUG_TARGET}.debug" DESTINATION "${CMAKE_INSTALL_LIBDIR}/debug/${CMAKE_INSTALL_FULL_BINDIR}" COMPONENT clickhouse) endmacro() diff --git a/packages/clickhouse-common-static-dbg.yaml b/packages/clickhouse-common-static-dbg.yaml index 349f9ec0c47..12a1594bd30 100644 --- a/packages/clickhouse-common-static-dbg.yaml +++ b/packages/clickhouse-common-static-dbg.yaml @@ -21,12 +21,12 @@ description: | This package contains the debugging symbols for clickhouse-common. 
contents: -- src: root/usr/lib/debug/clickhouse.debug - dst: /usr/lib/debug/clickhouse.debug -- src: root/usr/lib/debug/clickhouse-odbc-bridge.debug - dst: /usr/lib/debug/clickhouse-odbc-bridge.debug -- src: root/usr/lib/debug/clickhouse-library-bridge.debug - dst: /usr/lib/debug/clickhouse-library-bridge.debug +- src: root/usr/lib/debug/usr/bin/clickhouse.debug + dst: /usr/lib/debug/usr/bin/clickhouse.debug +- src: root/usr/lib/debug/usr/bin/clickhouse-odbc-bridge.debug + dst: /usr/lib/debug/usr/bin/clickhouse-odbc-bridge.debug +- src: root/usr/lib/debug/usr/bin/clickhouse-library-bridge.debug + dst: /usr/lib/debug/usr/bin/clickhouse-library-bridge.debug # docs - src: ../AUTHORS dst: /usr/share/doc/clickhouse-common-static-dbg/AUTHORS From 64f79f0c69699a66fe69f6c806bedef9d0dead68 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 25 Mar 2022 00:14:26 +0100 Subject: [PATCH 068/111] Added an ability to specify cluster secret in replicated database (#35333) --- src/Databases/DatabaseReplicated.cpp | 29 +++++++++++++++++----- src/Databases/DatabaseReplicated.h | 10 ++++++++ src/Databases/DatabaseReplicatedSettings.h | 7 +++--- src/Interpreters/Cluster.cpp | 22 +++++++++++++--- src/Interpreters/Cluster.h | 8 ++++-- 5 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d9d9f5b45f6..0c3cc56c061 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -88,6 +88,9 @@ DatabaseReplicated::DatabaseReplicated( /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; + + if (!db_settings.collection_name.value.empty()) + fillClusterAuthInfo(db_settings.collection_name.value, context_->getConfigRef()); } String DatabaseReplicated::getFullReplicaName() const @@ -191,22 +194,36 @@ ClusterPtr DatabaseReplicated::getClusterImpl() const shards.back().emplace_back(unescapeForFileName(host_port)); } - String username = db_settings.cluster_username; - String password = db_settings.cluster_password; UInt16 default_port = getContext()->getTCPPort(); - bool secure = db_settings.cluster_secure_connection; bool treat_local_as_remote = false; bool treat_local_port_as_remote = getContext()->getApplicationType() == Context::ApplicationType::LOCAL; return std::make_shared( getContext()->getSettingsRef(), shards, - username, - password, + cluster_auth_info.cluster_username, + cluster_auth_info.cluster_password, default_port, treat_local_as_remote, treat_local_port_as_remote, - secure); + cluster_auth_info.cluster_secure_connection, + /*priority=*/1, + database_name, + cluster_auth_info.cluster_secret); +} + + +void DatabaseReplicated::fillClusterAuthInfo(String collection_name, const Poco::Util::AbstractConfiguration & config_ref) +{ + const auto & config_prefix = fmt::format("named_collections.{}", collection_name); + + if (!config_ref.has(config_prefix)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection_name); + + cluster_auth_info.cluster_username = config_ref.getString(config_prefix + ".cluster_username", ""); + cluster_auth_info.cluster_password = config_ref.getString(config_prefix + ".cluster_password", ""); + cluster_auth_info.cluster_secret = config_ref.getString(config_prefix + ".cluster_secret", ""); + cluster_auth_info.cluster_secure_connection = config_ref.getBool(config_prefix + 
".cluster_secure_connection", false); } void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fcb8a2c4d33..ac212e168b8 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -75,6 +75,16 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); + struct + { + String cluster_username{"default"}; + String cluster_password; + String cluster_secret; + bool cluster_secure_connection{false}; + } cluster_auth_info; + + void fillClusterAuthInfo(String collection_name, const Poco::Util::AbstractConfiguration & config); + void checkQueryValid(const ASTPtr & query, ContextPtr query_context) const; void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr); diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h index 0aff26712c0..8bed1ababf6 100644 --- a/src/Databases/DatabaseReplicatedSettings.h +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -8,12 +8,11 @@ namespace DB class ASTStorage; #define LIST_OF_DATABASE_REPLICATED_SETTINGS(M) \ - M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ + M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ - M(String, cluster_username, "default", "Username to use when connecting to hosts of cluster", 0) \ - M(String, cluster_password, "", "Password to use when connecting to hosts of cluster", 0) \ - M(Bool, cluster_secure_connection, false, "Enable TLS when connecting to hosts of cluster", 0) \ + M(String, collection_name, "", "A name of a collection defined in server's config where all info for cluster authentication is defined", 0) \ + DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index d558d1cfd67..1039fac6883 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -132,7 +132,9 @@ Cluster::Address::Address( bool secure_, Int64 priority_, UInt32 shard_index_, - UInt32 replica_index_) + UInt32 replica_index_, + String cluster_name_, + String cluster_secret_) : user(user_), password(password_) { bool can_be_local = true; @@ -164,6 +166,8 @@ Cluster::Address::Address( is_local = can_be_local && isLocal(clickhouse_port); shard_index = shard_index_; replica_index = replica_index_; + cluster = cluster_name_; + cluster_secret = cluster_secret_; } @@ -537,10 +541,14 @@ Cluster::Cluster( bool treat_local_as_remote, bool treat_local_port_as_remote, bool secure, - Int64 priority) + Int64 priority, + String cluster_name, + String cluster_secret) { UInt32 current_shard_num = 1; + secret = cluster_secret; + for (const auto & shard : names) { Addresses current; @@ -554,7 +562,9 @@ Cluster::Cluster( secure, priority, current_shard_num, - current.size() + 1); + current.size() + 1, + cluster_name, + cluster_secret); 
addresses_with_failover.emplace_back(current); @@ -690,6 +700,9 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti } } + secret = from.secret; + name = from.name; + initMisc(); } @@ -704,6 +717,9 @@ Cluster::Cluster(Cluster::SubclusterTag, const Cluster & from, const std::vector addresses_with_failover.emplace_back(from.addresses_with_failover.at(index)); } + secret = from.secret; + name = from.name; + initMisc(); } diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index e9f26c21089..13f19f7c0ed 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -55,7 +55,9 @@ public: bool treat_local_as_remote, bool treat_local_port_as_remote, bool secure = false, - Int64 priority = 1); + Int64 priority = 1, + String cluster_name = "", + String cluster_secret = ""); Cluster(const Cluster &)= delete; Cluster & operator=(const Cluster &) = delete; @@ -127,7 +129,9 @@ public: bool secure_ = false, Int64 priority_ = 1, UInt32 shard_index_ = 0, - UInt32 replica_index_ = 0); + UInt32 replica_index_ = 0, + String cluster_name = "", + String cluster_secret_ = ""); /// Returns 'escaped_host_name:port' String toString() const; From 86dcbe100c94eb802318ef29de2da3487881fed6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 25 Mar 2022 11:25:42 +0100 Subject: [PATCH 069/111] Add dbg symbols --- packages/clickhouse-keeper-dbg.yaml | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 packages/clickhouse-keeper-dbg.yaml diff --git a/packages/clickhouse-keeper-dbg.yaml b/packages/clickhouse-keeper-dbg.yaml new file mode 100644 index 00000000000..e5d9419eb89 --- /dev/null +++ b/packages/clickhouse-keeper-dbg.yaml @@ -0,0 +1,32 @@ +# package sources should be placed in ${PWD}/root +# nfpm should run from the same directory with a config +name: "clickhouse-keeper-dbg" +arch: "${DEB_ARCH}" # amd64, arm64 +platform: "linux" +version: "${CLICKHOUSE_VERSION_STRING}" +vendor: "ClickHouse Inc." +homepage: "https://clickhouse.com" +license: "Apache" +section: "database" +priority: "optional" + +conflicts: +- clickhouse-server + +maintainer: "ClickHouse Dev Team " +description: | + debugging symbols for clickhouse-keeper + This package contains the debugging symbols for clickhouse-keeper. + +contents: +- src: root/usr/lib/debug/clickhouse-keeper.debug + dst: /usr/lib/debug/clickhouse-keeper.debug +# docs +- src: ../AUTHORS + dst: /usr/share/doc/clickhouse-keeper-dbg/AUTHORS +- src: ../CHANGELOG.md + dst: /usr/share/doc/clickhouse-keeper-dbg/CHANGELOG.md +- src: ../LICENSE + dst: /usr/share/doc/clickhouse-keeper-dbg/LICENSE +- src: ../README.md + dst: /usr/share/doc/clickhouse-keeper-dbg/README.md From 058b641d1380f3c1ac9d60d0bc21e2817a1256fa Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 25 Mar 2022 11:32:23 +0100 Subject: [PATCH 070/111] Update docs/en/operations/settings/memory-overcommit.md --- docs/en/operations/settings/memory-overcommit.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md index f7d2bd0e239..3f99382b826 100644 --- a/docs/en/operations/settings/memory-overcommit.md +++ b/docs/en/operations/settings/memory-overcommit.md @@ -6,7 +6,7 @@ The idea of this technique is to introduce settings which can represent guarante When memory overcommit is enabled and the memory limit is reached ClickHouse will select the most overcommitted query and try to free memory by killing this query. 
When memory limit is reached any query will wait some time during atempt to allocate new memory. -If selected query is killed and memory is freed within waiting timeout, query will continue execution after waiting, otherwise it'll be killed too. +If timeout is passed and memory is freed, the query continues execution. Otherwise an exception will be thrown and the query is killed. Selection of query to stop or kill is performed by either global or user overcommit trackers depending on what memory limit is reached. From 9e11e611c7e31fc2332c52569929c12cbc4640e1 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 25 Mar 2022 11:12:02 +0000 Subject: [PATCH 071/111] Match cardinality when adding columns --- src/Interpreters/HashJoin.cpp | 23 +++++++++++++++++------ src/Interpreters/join_common.cpp | 7 ++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index f55ca0dac5a..00568cfdf08 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -284,7 +284,6 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s } JoinCommon::convertToFullColumnsInplace(right_table_keys); - JoinCommon::convertToFullColumnsInplace(sample_block_with_columns_to_add); initRightBlockStructure(data->sample_block); JoinCommon::createMissedColumns(sample_block_with_columns_to_add); @@ -963,18 +962,29 @@ public: /// If it's joinGetOrNull, we need to wrap not-nullable columns in StorageJoin. for (size_t j = 0, size = right_indexes.size(); j < size; ++j) { - const auto & column = *block.getByPosition(right_indexes[j]).column; - if (auto * nullable_col = typeid_cast(columns[j].get()); nullable_col && !column.isNullable()) - nullable_col->insertFromNotNullable(column, row_num); + auto column_from_block = block.getByPosition(right_indexes[j]); + if (type_name[j].type->lowCardinality() != column_from_block.type->lowCardinality()) + { + JoinCommon::changeLowCardinalityInplace(column_from_block); + } + + if (auto * nullable_col = typeid_cast(columns[j].get()); + nullable_col && !column_from_block.column->isNullable()) + nullable_col->insertFromNotNullable(*column_from_block.column, row_num); else - columns[j]->insertFrom(column, row_num); + columns[j]->insertFrom(*column_from_block.column, row_num); } } else { for (size_t j = 0, size = right_indexes.size(); j < size; ++j) { - columns[j]->insertFrom(*block.getByPosition(right_indexes[j]).column, row_num); + auto column_from_block = block.getByPosition(right_indexes[j]); + if (type_name[j].type->lowCardinality() != column_from_block.type->lowCardinality()) + { + JoinCommon::changeLowCardinalityInplace(column_from_block); + } + columns[j]->insertFrom(*column_from_block.column, row_num); } } } @@ -1014,6 +1024,7 @@ private: void addColumn(const ColumnWithTypeAndName & src_column, const std::string & qualified_name) { + columns.push_back(src_column.column->cloneEmpty()); columns.back()->reserve(src_column.column->size()); type_name.emplace_back(src_column.type, src_column.name, qualified_name); diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index 478df653f3b..47b792f81e9 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -326,9 +326,10 @@ ColumnRawPtrMap materializeColumnsInplaceMap(Block & block, const Names & names) for (const auto & column_name : names) { - auto & column = block.getByName(column_name).column; - column = recursiveRemoveLowCardinality(column->convertToFullColumnIfConst()); - 
ptrs[column_name] = column.get(); + auto & column = block.getByName(column_name); + column.column = recursiveRemoveLowCardinality(column.column->convertToFullColumnIfConst()); + column.type = recursiveRemoveLowCardinality(column.type); + ptrs[column_name] = column.column.get(); } return ptrs; From 8be73a05547305623d617265801163bad2a03762 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 25 Mar 2022 11:23:23 +0000 Subject: [PATCH 072/111] Fix test --- .../queries/0_stateless/01037_polygon_dicts_correctness_all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01037_polygon_dicts_correctness_all.sh b/tests/queries/0_stateless/01037_polygon_dicts_correctness_all.sh index fae263b076f..c2a35a3ef63 100755 --- a/tests/queries/0_stateless/01037_polygon_dicts_correctness_all.sh +++ b/tests/queries/0_stateless/01037_polygon_dicts_correctness_all.sh @@ -34,7 +34,7 @@ CREATE TABLE test_01037.polygons_array ENGINE = Memory; " -$CLICKHOUSE_CLIENT --query="INSERT INTO test_01037.polygons_array FORMAT JSONEachRow" --max_insert_block_size=100000 < "${CURDIR}/01037_polygon_data" +$CLICKHOUSE_CLIENT --query="INSERT INTO test_01037.polygons_array FORMAT JSONEachRow" --min_chunk_bytes_for_parallel_parsing=10485760 --max_insert_block_size=100000 < "${CURDIR}/01037_polygon_data" rm "${CURDIR}"/01037_polygon_data From 200abb51dca03501abea27d8015dd10ef1f1b674 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 25 Mar 2022 09:53:56 +0100 Subject: [PATCH 073/111] Better test --- tests/config/config.d/named_collection.xml | 4 +- .../02244_url_engine_headers_test.python | 141 ------------------ .../02244_url_engine_headers_test.reference | 2 +- .../02244_url_engine_headers_test.sh | 7 - .../02244_url_engine_headers_test.sql | 1 + 5 files changed, 4 insertions(+), 151 deletions(-) delete mode 100644 tests/queries/0_stateless/02244_url_engine_headers_test.python delete mode 100755 tests/queries/0_stateless/02244_url_engine_headers_test.sh create mode 100644 tests/queries/0_stateless/02244_url_engine_headers_test.sql diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index 7d60533525e..bc75461d664 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -17,8 +17,8 @@
- test_header - test_header_clickhouse + X-ClickHouse-Format + JSONEachRow
diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.python b/tests/queries/0_stateless/02244_url_engine_headers_test.python deleted file mode 100644 index f9c594233a0..00000000000 --- a/tests/queries/0_stateless/02244_url_engine_headers_test.python +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 - -from http.server import SimpleHTTPRequestHandler,HTTPServer -import socket -import sys -import threading -import os -import traceback -import urllib.request -import subprocess - - -def is_ipv6(host): - try: - socket.inet_aton(host) - return False - except: - return True - -def get_local_port(host, ipv6): - if ipv6: - family = socket.AF_INET6 - else: - family = socket.AF_INET - - with socket.socket(family) as fd: - fd.bind((host, 0)) - return fd.getsockname()[1] - -CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost') -CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') - -# Server returns this JSON response. -SERVER_JSON_RESPONSE = \ -'''{ - "login": "ClickHouse", - "id": 54801242, - "name": "ClickHouse", - "company": null -}''' - -EXPECTED_ANSWER = \ -'''{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}''' - -##################################################################################### -# This test starts an HTTP server and serves data to clickhouse url-engine based table. -# The objective of this test is to check the ClickHouse server provides a User-Agent -# with HTTP requests. -# In order for it to work ip+port of http server (given below) should be -# accessible from clickhouse server. -##################################################################################### - -# IP-address of this host accessible from the outside world. Get the first one -HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0] -IS_IPV6 = is_ipv6(HTTP_SERVER_HOST) -HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6) - -# IP address and port of the HTTP server started from this script. 
-HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT) -if IS_IPV6: - HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/" -else: - HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/" - - -def get_ch_answer(query): - host = CLICKHOUSE_HOST - if IS_IPV6: - host = f'[{host}]' - - url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) - return urllib.request.urlopen(url, data=query.encode()).read().decode() - -def check_answers(query, answer): - ch_answer = get_ch_answer(query) - if ch_answer.strip() != answer.strip(): - print("FAIL on query:", query, file=sys.stderr) - print("Expected answer:", answer, file=sys.stderr) - print("Fetched answer :", ch_answer, file=sys.stderr) - raise Exception("Fail on query") - -class HttpProcessor(SimpleHTTPRequestHandler): - def _set_headers(self): - test_header = self.headers.get('test_header') - if test_header and test_header.startswith('test_header_clickhouse'): - self.send_response(200) - else: - self.send_response(403) - - self.send_header('Content-Type', 'text/csv') - self.end_headers() - - def do_GET(self): - self._set_headers() - self.wfile.write(SERVER_JSON_RESPONSE.encode()) - - def log_message(self, format, *args): - return - -class HTTPServerV6(HTTPServer): - address_family = socket.AF_INET6 - -def start_server(requests_amount): - if IS_IPV6: - httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor) - else: - httpd = HTTPServer(HTTP_SERVER_ADDRESS, HttpProcessor) - - def real_func(): - for i in range(requests_amount): - httpd.handle_request() - - t = threading.Thread(target=real_func) - return t - -##################################################################### -# Testing area. -##################################################################### - -def test_select(): - global HTTP_SERVER_URL_STR - query = 'SELECT * FROM url(url_with_headers, url=\'{}\', format=\'JSONAsString\');'.format(HTTP_SERVER_URL_STR) - check_answers(query, EXPECTED_ANSWER) - -def main(): - t = start_server(2) - t.start() - test_select() - t.join() - print("PASSED") - -if __name__ == "__main__": - try: - main() - except Exception as ex: - exc_type, exc_value, exc_traceback = sys.exc_info() - traceback.print_tb(exc_traceback, file=sys.stderr) - print(ex, file=sys.stderr) - sys.stderr.flush() - - os._exit(1) diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.reference b/tests/queries/0_stateless/02244_url_engine_headers_test.reference index 53cdf1e9393..7b96f09d82e 100644 --- a/tests/queries/0_stateless/02244_url_engine_headers_test.reference +++ b/tests/queries/0_stateless/02244_url_engine_headers_test.reference @@ -1 +1 @@ -PASSED +{"12":12}\n diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.sh b/tests/queries/0_stateless/02244_url_engine_headers_test.sh deleted file mode 100755 index 37d89907d79..00000000000 --- a/tests/queries/0_stateless/02244_url_engine_headers_test.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CURDIR"/../shell_config.sh - -python3 "$CURDIR"/02244_url_engine_headers_test.python diff --git a/tests/queries/0_stateless/02244_url_engine_headers_test.sql b/tests/queries/0_stateless/02244_url_engine_headers_test.sql new file mode 100644 index 00000000000..e71a933346c --- /dev/null +++ b/tests/queries/0_stateless/02244_url_engine_headers_test.sql @@ -0,0 +1 @@ +select * from url(url_with_headers, url='http://127.0.0.1:8123?query=select+12', format='RawBLOB'); From c954e1038382380f75a087f7e24cef926d330116 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 25 Mar 2022 12:00:02 +0000 Subject: [PATCH 074/111] add tests for hash join with LowCardinality --- .../02244_lowcardinality_hash_join.reference | 4 +++ .../02244_lowcardinality_hash_join.sql | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 tests/queries/0_stateless/02244_lowcardinality_hash_join.reference create mode 100644 tests/queries/0_stateless/02244_lowcardinality_hash_join.sql diff --git a/tests/queries/0_stateless/02244_lowcardinality_hash_join.reference b/tests/queries/0_stateless/02244_lowcardinality_hash_join.reference new file mode 100644 index 00000000000..d89bbd39cdc --- /dev/null +++ b/tests/queries/0_stateless/02244_lowcardinality_hash_join.reference @@ -0,0 +1,4 @@ +x x +x x +x x +x x diff --git a/tests/queries/0_stateless/02244_lowcardinality_hash_join.sql b/tests/queries/0_stateless/02244_lowcardinality_hash_join.sql new file mode 100644 index 00000000000..f2a601adf06 --- /dev/null +++ b/tests/queries/0_stateless/02244_lowcardinality_hash_join.sql @@ -0,0 +1,27 @@ +-- Tags: no-parallel +DROP TABLE IF EXISTS lc_table; + +CREATE TABLE lc_table +( + col LowCardinality(String) +) ENGINE=TinyLog; + +INSERT INTO lc_table VALUES('x'); + +SELECT * +FROM lc_table +INNER JOIN lc_table AS lc_table2 ON lc_table.col = lc_table2.col; + +SELECT * +FROM lc_table +INNER JOIN lc_table AS lc_table2 ON CAST(lc_table.col AS String) = CAST(lc_table2.col AS String); + +SELECT * +FROM lc_table +INNER JOIN lc_table AS lc_table2 ON (lc_table.col = lc_table2.col) OR (lc_table.col = lc_table2.col); + +SELECT * +FROM lc_table +INNER JOIN lc_table AS lc_table2 ON (CAST(lc_table.col AS String) = CAST(lc_table2.col AS String)) OR (CAST(lc_table.col AS String) = CAST(lc_table2.col AS String)); + +DROP TABLE IF EXISTS lc_table; From 5103aefd3410c54b10e3892b1583bf7e9c1859a7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 25 Mar 2022 14:33:54 +0100 Subject: [PATCH 075/111] Fix --- src/Interpreters/ExpressionAnalyzer.cpp | 19 +++++-------------- .../02006_test_positional_arguments.reference | 4 ++++ .../02006_test_positional_arguments.sql | 2 ++ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 841d7bc567f..e2e0fe5287b 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -100,20 +100,9 @@ bool checkPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_q { auto columns = select_query->select()->children; - const auto * group_by_expr_with_alias = dynamic_cast(argument.get()); - if (group_by_expr_with_alias && !group_by_expr_with_alias->alias.empty()) - { - for (const auto & column : columns) - { - const auto * col_with_alias = dynamic_cast(column.get()); - if (col_with_alias) - { - const auto & alias = col_with_alias->alias; - if (!alias.empty() && alias == group_by_expr_with_alias->alias) - return false; - } - } - } + const auto * expr_with_alias = 
dynamic_cast(argument.get()); + if (expr_with_alias && !expr_with_alias->alias.empty()) + return false; const auto * ast_literal = typeid_cast(argument.get()); if (!ast_literal) @@ -1324,7 +1313,9 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChai throw Exception("Bad ORDER BY expression AST", ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE); if (getContext()->getSettingsRef().enable_positional_arguments) + { replaceForPositionalArguments(ast->children.at(0), select_query, ASTSelectQuery::Expression::ORDER_BY); + } } getRootActions(select_query->orderBy(), only_types, step.actions()); diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index c5c5f115b0a..8a10823b9e7 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -111,3 +111,7 @@ select substr('aaaaaaaaaaaaaa', 8) as a group by a; aaaaaaa select substr('aaaaaaaaaaaaaa', 8) as a group by substr('aaaaaaaaaaaaaa', 8); aaaaaaa +select b from (select 5 as a, 'Hello' as b order by a); +Hello +select b from (select 5 as a, 'Hello' as b group by a); +Hello diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 7442ca6bbf6..30cac810d27 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -45,3 +45,5 @@ explain syntax select plus(1, 1) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by substr('aaaaaaaaaaaaaa', 8); +select b from (select 5 as a, 'Hello' as b order by a); +select b from (select 5 as a, 'Hello' as b group by a); From 9ec0951de589b2c27f05ec1707627176ceef586b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 25 Mar 2022 15:54:05 +0100 Subject: [PATCH 076/111] update comment for mismatching checksums --- .../MergeTree/MergeFromLogEntryTask.cpp | 19 +++++++++++-------- .../MergeTree/MutateFromLogEntryTask.cpp | 3 ++- src/Storages/StorageReplicatedMergeTree.cpp | 12 ++++++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index 66356fd005b..68ffb42a90a 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -279,14 +279,17 @@ bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrite ProfileEvents::increment(ProfileEvents::DataAfterMergeDiffersFromReplica); LOG_ERROR(log, - "{}. Data after merge is not byte-identical to data on another replicas. There could be several" - " reasons: 1. Using newer version of compression library after server update. 2. Using another" - " compression method. 3. Non-deterministic compression algorithm (highly unlikely). 4." - " Non-deterministic merge algorithm due to logical error in code. 5. Data corruption in memory due" - " to bug in code. 6. Data corruption in memory due to hardware issue. 7. Manual modification of" - " source data after server startup. 8. Manual modification of checksums stored in ZooKeeper. 9." - " Part format related settings like 'enable_mixed_granularity_parts' are different on different" - " replicas. We will download merged part from replica to force byte-identical result.", + "{}. 
Data after merge is not byte-identical to data on another replicas. There could be several reasons:" + " 1. Using newer version of compression library after server update." + " 2. Using another compression method." + " 3. Non-deterministic compression algorithm (highly unlikely)." + " 4. Non-deterministic merge algorithm due to logical error in code." + " 5. Data corruption in memory due to bug in code." + " 6. Data corruption in memory due to hardware issue." + " 7. Manual modification of source data after server startup." + " 8. Manual modification of checksums stored in ZooKeeper." + " 9. Part format related settings like 'enable_mixed_granularity_parts' are different on different replicas." + " We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false)); write_part_log(ExecutionStatus::fromCurrentException()); diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 3f220566260..309432e4675 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -185,7 +185,8 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit ProfileEvents::increment(ProfileEvents::DataAfterMutationDiffersFromReplica); - LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on another replicas. We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false)); + LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on another replicas. " + "We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false)); write_part_log(ExecutionStatus::fromCurrentException()); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b013b24f17b..d9f72cf7feb 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1312,10 +1312,14 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil: if (replica_part_header.getColumnsHash() != local_part_header.getColumnsHash()) { - /// Either it's a bug or ZooKeeper contains broken data. - /// TODO Fix KILL MUTATION and replace CHECKSUM_DOESNT_MATCH with LOGICAL_ERROR - /// (some replicas may skip killed mutation even if it was executed on other replicas) - throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Part {} from {} has different columns hash", part_name, replica); + /// Currently there are two (known) cases when it may happen: + /// - KILL MUTATION query had removed mutation before all replicas have executed assigned MUTATE_PART entries. + /// Some replicas may skip this mutation and update part version without actually applying any changes. + /// It leads to mismatching checksum if changes were applied on other replicas. + /// - ALTER_METADATA and MERGE_PARTS were reordered on some replicas. + /// It may lead to different number of columns in merged parts on these replicas. 
+ throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Part {} from {} has different columns hash " + "(it may rarely happen on race condition with KILL MUTATION or ALTER COLUMN).", part_name, replica); } replica_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true); From 543c46d2fda34d2c61720a3ba6455f09e19cbcb4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 25 Mar 2022 17:29:15 +0100 Subject: [PATCH 077/111] Fix dbg package --- packages/clickhouse-keeper-dbg.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/clickhouse-keeper-dbg.yaml b/packages/clickhouse-keeper-dbg.yaml index e5d9419eb89..685bff11080 100644 --- a/packages/clickhouse-keeper-dbg.yaml +++ b/packages/clickhouse-keeper-dbg.yaml @@ -19,8 +19,8 @@ description: | This package contains the debugging symbols for clickhouse-keeper. contents: -- src: root/usr/lib/debug/clickhouse-keeper.debug - dst: /usr/lib/debug/clickhouse-keeper.debug +- src: root/usr/lib/debug/usr/bin/clickhouse-keeper.debug + dst: /usr/lib/debug/usr/bin/clickhouse-keeper.debug # docs - src: ../AUTHORS dst: /usr/share/doc/clickhouse-keeper-dbg/AUTHORS From 2712368f78dfd0308c90ed13997d5b8631146eeb Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 25 Mar 2022 17:44:53 +0100 Subject: [PATCH 078/111] Push only to the new CI DB --- tests/ci/clickhouse_helper.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 9d8a7463b3e..7ccbcb4a47e 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,30 +8,16 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None, user=None, password=None): - self.url2 = None - self.auth2 = None - + def __init__(self, url=None): if url is None: - url = get_parameter_from_ssm("clickhouse-test-stat-url") - self.url2 = get_parameter_from_ssm("clickhouse-test-stat-url2") - self.auth2 = { + self.url = get_parameter_from_ssm("clickhouse-test-stat-url2") + self.auth = { "X-ClickHouse-User": get_parameter_from_ssm( "clickhouse-test-stat-login2" ), "X-ClickHouse-Key": "", } - self.url = url - self.auth = { - "X-ClickHouse-User": user - if user is not None - else get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": password - if password is not None - else get_parameter_from_ssm("clickhouse-test-stat-password"), - } - @staticmethod def _insert_json_str_info_impl(url, auth, db, table, json_str): params = { @@ -78,8 +64,6 @@ class ClickHouseHelper: def _insert_json_str_info(self, db, table, json_str): self._insert_json_str_info_impl(self.url, self.auth, db, table, json_str) - if self.url2: - self._insert_json_str_info_impl(self.url2, self.auth2, db, table, json_str) def insert_event_into(self, db, table, event): event_str = json.dumps(event) From d8960cde625810901c211419a94bf273e4457fc3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 25 Mar 2022 20:03:12 +0100 Subject: [PATCH 079/111] Fix positional order/etc by literal --- src/Interpreters/ExpressionAnalyzer.cpp | 2 +- .../0_stateless/02006_test_positional_arguments.reference | 2 ++ tests/queries/0_stateless/02006_test_positional_arguments.sql | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index e2e0fe5287b..5877ca35392 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -119,7 +119,7 
@@ bool checkPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_q pos, columns.size()); const auto & column = columns[--pos]; - if (typeid_cast(column.get())) + if (typeid_cast(column.get()) || typeid_cast(column.get())) { argument = column->clone(); } diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index 8a10823b9e7..f86a1ab6c47 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -115,3 +115,5 @@ select b from (select 5 as a, 'Hello' as b order by a); Hello select b from (select 5 as a, 'Hello' as b group by a); Hello +select b from (select 5 as a, 'Hello' as b order by 1); +Hello diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 30cac810d27..2a02cd03c93 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -47,3 +47,4 @@ select substr('aaaaaaaaaaaaaa', 8) as a group by substr('aaaaaaaaaaaaaa', 8); select b from (select 5 as a, 'Hello' as b order by a); select b from (select 5 as a, 'Hello' as b group by a); +select b from (select 5 as a, 'Hello' as b order by 1); From b3d80c7822cc47104012b0409d8d8ce091592b3b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 25 Mar 2022 20:12:19 +0100 Subject: [PATCH 080/111] test From e4558b156846f2fe48be346626f15450575cb858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Sat, 26 Mar 2022 11:12:22 +0100 Subject: [PATCH 081/111] Update submodules libc++ to LLVM 14 --- .gitmodules | 6 +++--- contrib/hyperscan | 2 +- contrib/libcxx | 2 +- contrib/libcxxabi | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index bb028d63019..6c9e66f9cbc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -70,10 +70,10 @@ url = https://github.com/ClickHouse/libgsasl.git [submodule "contrib/libcxx"] path = contrib/libcxx - url = https://github.com/Algunenano/libcxx.git + url = https://github.com/ClickHouse/libcxx.git [submodule "contrib/libcxxabi"] path = contrib/libcxxabi - url = https://github.com/Algunenano/libcxxabi.git + url = https://github.com/ClickHouse/libcxxabi.git [submodule "contrib/snappy"] path = contrib/snappy url = https://github.com/ClickHouse/snappy.git @@ -88,7 +88,7 @@ url = https://github.com/ClickHouse/h3 [submodule "contrib/hyperscan"] path = contrib/hyperscan - url = https://github.com/Algunenano/hyperscan.git + url = https://github.com/ClickHouse/hyperscan.git [submodule "contrib/libunwind"] path = contrib/libunwind url = https://github.com/ClickHouse/libunwind.git diff --git a/contrib/hyperscan b/contrib/hyperscan index e2ac3060fd5..5edc68c5ac6 160000 --- a/contrib/hyperscan +++ b/contrib/hyperscan @@ -1 +1 @@ -Subproject commit e2ac3060fd5136953980d7599d7194f6779a06f6 +Subproject commit 5edc68c5ac68d2d4f876159e9ee84def6d3dc87c diff --git a/contrib/libcxx b/contrib/libcxx index f5374ce9484..172b2ae074f 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit f5374ce9484dc4accff6aa39f307f377cecad906 +Subproject commit 172b2ae074f6755145b91c53a95c8540c1468239 diff --git a/contrib/libcxxabi b/contrib/libcxxabi index 8106144ae52..6eb7cc7a7bd 160000 --- a/contrib/libcxxabi +++ b/contrib/libcxxabi @@ -1 +1 @@ -Subproject commit 
8106144ae5246d21783b80c3a0887b195805b438 +Subproject commit 6eb7cc7a7bdd779e6734d1b9fb451df2274462d7 From 22c9b540367f3df8ebe36ea2c10f1b6171dd7fc1 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sat, 26 Mar 2022 15:09:48 -0300 Subject: [PATCH 082/111] test for partition_by using ignore() --- .../02245_weird_partitions_pruning.reference | 14 +++++ .../02245_weird_partitions_pruning.sql | 61 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests/queries/0_stateless/02245_weird_partitions_pruning.reference create mode 100644 tests/queries/0_stateless/02245_weird_partitions_pruning.sql diff --git a/tests/queries/0_stateless/02245_weird_partitions_pruning.reference b/tests/queries/0_stateless/02245_weird_partitions_pruning.reference new file mode 100644 index 00000000000..cf406b417b4 --- /dev/null +++ b/tests/queries/0_stateless/02245_weird_partitions_pruning.reference @@ -0,0 +1,14 @@ +202112-0 (202112,0) +202201-0 (202201,0) +202301-0 (202301,0) +202112-0 2021-12-31 22:30:00 2021-12-31 22:30:00 2021-12-31 14:30:00 2021-12-31 14:30:00 1000 +202201-0 2022-01-01 00:30:00 2022-01-31 22:30:00 2021-12-31 16:30:00 2022-01-31 14:30:00 2000 +202301-0 2023-01-31 22:30:00 2023-01-31 22:30:00 2023-01-31 14:30:00 2023-01-31 14:30:00 1000 +202112-0 +default weird_partitions_02245 1 1000 1 +202201-0 +default weird_partitions_02245 1 2000 1 +202112-0 +202201-0 +default weird_partitions_02245 2 3000 2 +default weird_partitions_02245 0 0 0 diff --git a/tests/queries/0_stateless/02245_weird_partitions_pruning.sql b/tests/queries/0_stateless/02245_weird_partitions_pruning.sql new file mode 100644 index 00000000000..2c4b2dde0f4 --- /dev/null +++ b/tests/queries/0_stateless/02245_weird_partitions_pruning.sql @@ -0,0 +1,61 @@ +-- We use a hack - partition by ignore(d1). In some cases there are two columns +-- not fully correlated (<1) (date_begin - date_end or datetime - datetime_in_TZ_with_DST) +-- If we partition by these columns instead of one it will be twice more partitions. +-- Partition by (.., ignore(d1)) allows to partition by the first column but build +-- min_max indexes for both column, so partition pruning works for both columns. +-- It's very similar to min_max skip index but gives bigger performance boost, +-- because partition pruning happens on very early query stage. 
+ + +DROP TABLE IF EXISTS weird_partitions_02245; + +CREATE TABLE weird_partitions_02245(d DateTime, d1 DateTime default d - toIntervalHour(8), id Int64) +Engine=MergeTree +PARTITION BY (toYYYYMM(toDateTime(d)), ignore(d1)) +ORDER BY id; + +INSERT INTO weird_partitions_02245(d, id) +SELECT + toDateTime('2021-12-31 22:30:00') AS d, + number +FROM numbers(1000); + +INSERT INTO weird_partitions_02245(d, id) +SELECT + toDateTime('2022-01-01 00:30:00') AS d, + number +FROM numbers(1000); + +INSERT INTO weird_partitions_02245(d, id) +SELECT + toDateTime('2022-01-31 22:30:00') AS d, + number +FROM numbers(1000); + +INSERT INTO weird_partitions_02245(d, id) +SELECT + toDateTime('2023-01-31 22:30:00') AS d, + number +FROM numbers(1000); + +OPTIMIZE TABLE weird_partitions_02245; +OPTIMIZE TABLE weird_partitions_02245; + +SELECT DISTINCT _partition_id, _partition_value FROM weird_partitions_02245 ORDER BY _partition_id ASC; + +SELECT _partition_id, min(d), max(d), min(d1), max(d1), count() FROM weird_partitions_02245 GROUP BY _partition_id ORDER BY _partition_id ASC; + +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2021-12-31 00:00:00' and d < '2022-01-01 00:00:00'; +explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2021-12-31 00:00:00' and d < '2022-01-01 00:00:00'; + +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; +explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; + +select DISTINCT _partition_id from weird_partitions_02245 where d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; +explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; + +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2020-01-01 00:00:00'; +explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2020-01-01 00:00:00'; + +DROP TABLE weird_partitions_02245; + From b7b06e1b64a024a7758fa6ee9c7fd3af24d52016 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sat, 26 Mar 2022 16:43:57 -0300 Subject: [PATCH 083/111] fix order --- .../0_stateless/02245_weird_partitions_pruning.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02245_weird_partitions_pruning.sql b/tests/queries/0_stateless/02245_weird_partitions_pruning.sql index 2c4b2dde0f4..6273a9f3d59 100644 --- a/tests/queries/0_stateless/02245_weird_partitions_pruning.sql +++ b/tests/queries/0_stateless/02245_weird_partitions_pruning.sql @@ -45,16 +45,16 @@ SELECT DISTINCT _partition_id, _partition_value FROM weird_partitions_02245 ORDE SELECT _partition_id, min(d), max(d), min(d1), max(d1), count() FROM weird_partitions_02245 GROUP BY _partition_id ORDER BY _partition_id ASC; -select DISTINCT _partition_id from weird_partitions_02245 where d >= '2021-12-31 00:00:00' and d < '2022-01-01 00:00:00'; +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2021-12-31 00:00:00' and d < '2022-01-01 00:00:00' ORDER BY _partition_id; explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2021-12-31 00:00:00' and d < '2022-01-01 00:00:00'; 
-select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00' ORDER BY _partition_id;; explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; -select DISTINCT _partition_id from weird_partitions_02245 where d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; +select DISTINCT _partition_id from weird_partitions_02245 where d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00' ORDER BY _partition_id;; explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d1 >= '2021-12-31 00:00:00' and d1 < '2022-01-01 00:00:00'; -select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2020-01-01 00:00:00'; +select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2020-01-01 00:00:00' ORDER BY _partition_id;; explain estimate select DISTINCT _partition_id from weird_partitions_02245 where d >= '2022-01-01 00:00:00' and d1 >= '2021-12-31 00:00:00' and d1 < '2020-01-01 00:00:00'; DROP TABLE weird_partitions_02245; From 6780defb42014348eab758f2fb4cfffa93564584 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 26 Mar 2022 22:45:45 +0100 Subject: [PATCH 084/111] Fix build report check --- tests/ci/build_report_check.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 5afe2991073..5946c00cc8a 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -148,6 +148,13 @@ if __name__ == "__main__": build_name, ) + some_builds_are_missing = len(build_reports_map) < len(reports_order) + + if some_builds_are_missing: + logging.info("Expected to get %s build results, got %s", len(reports_order), len(build_reports_map)) + else: + logging.info("Got exactly %s builds", len(build_reports_map)) + build_reports = [ build_reports_map[build_name] for build_name in reports_order @@ -219,10 +226,10 @@ if __name__ == "__main__": if build_result.status == "success": ok_builds += 1 - if ok_builds == 0: + if ok_builds == 0 or some_builds_are_missing: summary_status = "error" - description = "{}/{} builds are OK".format(ok_builds, total_builds) + description = f"{ok_builds}/{total_builds} builds are OK" print("::notice ::Report url: {}".format(url)) From bb35184da1e3a78f6eb673f5d91fc5de045f2e67 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 28 Mar 2022 02:04:19 +0200 Subject: [PATCH 085/111] Add metric about size of async INSERTs --- src/Access/Common/QuotaDefs.h | 2 +- src/Common/ProfileEvents.cpp | 1 + src/Interpreters/AsynchronousInsertQueue.cpp | 6 +++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Access/Common/QuotaDefs.h b/src/Access/Common/QuotaDefs.h index cfd8a07d9ff..7a69f811ea5 100644 --- a/src/Access/Common/QuotaDefs.h +++ b/src/Access/Common/QuotaDefs.h @@ -13,7 +13,7 @@ enum class QuotaType { QUERIES, /// Number of queries. QUERY_SELECTS, /// Number of select queries. - QUERY_INSERTS, /// Number of inserts queries. + QUERY_INSERTS, /// Number of insert queries. ERRORS, /// Number of queries with exceptions. 
RESULT_ROWS, /// Number of rows returned as result. RESULT_BYTES, /// Number of bytes returned as result. diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 178559894e3..dea2989d570 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -9,6 +9,7 @@ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \ + M(AsyncInsertBytes, "Number of bytes in inline data of asynchronous INSERT queries.") \ M(FailedQuery, "Number of failed queries.") \ M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.") \ M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.") \ diff --git a/src/Interpreters/AsynchronousInsertQueue.cpp b/src/Interpreters/AsynchronousInsertQueue.cpp index c60ab0f6510..6102066f85b 100644 --- a/src/Interpreters/AsynchronousInsertQueue.cpp +++ b/src/Interpreters/AsynchronousInsertQueue.cpp @@ -32,6 +32,7 @@ namespace CurrentMetrics namespace ProfileEvents { extern const Event AsyncInsertQuery; + extern const Event AsyncInsertBytes; } namespace DB @@ -222,7 +223,9 @@ void AsynchronousInsertQueue::pushImpl(InsertData::EntryPtr entry, QueueIterator if (!data) data = std::make_unique(); - data->size += entry->bytes.size(); + size_t entry_data_size = entry->bytes.size(); + + data->size += entry_data_size; data->last_update = std::chrono::steady_clock::now(); data->entries.emplace_back(entry); @@ -239,6 +242,7 @@ void AsynchronousInsertQueue::pushImpl(InsertData::EntryPtr entry, QueueIterator CurrentMetrics::add(CurrentMetrics::PendingAsyncInsert); ProfileEvents::increment(ProfileEvents::AsyncInsertQuery); + ProfileEvents::increment(ProfileEvents::AsyncInsertBytes, entry_data_size); } void AsynchronousInsertQueue::waitForProcessingQuery(const String & query_id, const Milliseconds & timeout) From 6fbf5bd9adb9a501cdfb041ffc780efa22d9ba5c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 28 Mar 2022 02:11:27 +0200 Subject: [PATCH 086/111] More correct --- src/Common/ProfileEvents.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index dea2989d570..e1bf8a37ee7 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -9,7 +9,7 @@ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \ - M(AsyncInsertBytes, "Number of bytes in inline data of asynchronous INSERT queries.") \ + M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \ M(FailedQuery, "Number of failed queries.") \ M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.") \ M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.") \ From c4832f7633ac7d47369bd274e6251d3675790193 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 28 Mar 2022 09:28:02 +0200 Subject: [PATCH 087/111] Fix --- .../MaterializedPostgreSQLConsumer.cpp | 40 +++++++++++++++---- .../MaterializedPostgreSQLConsumer.h | 2 + .../PostgreSQLReplicationHandler.cpp | 16 ++++---- .../PostgreSQL/PostgreSQLReplicationHandler.h | 3 +- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp 
index 5b963a544c8..e3aa4ff82a5 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp @@ -98,8 +98,24 @@ MaterializedPostgreSQLConsumer::StorageData::Buffer::Buffer( } +void MaterializedPostgreSQLConsumer::assertCorrectInsertion(StorageData::Buffer & buffer, size_t column_idx) +{ + if (column_idx >= buffer.description.sample_block.columns() + || column_idx >= buffer.description.types.size() + || column_idx >= buffer.columns.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to insert into buffer at position: {}, but block columns size is {}, types size: {}, columns size: {}, buffer structure: {}", + column_idx, + buffer.description.sample_block.columns(), buffer.description.types.size(), buffer.columns.size(), + buffer.description.sample_block.dumpStructure()); +} + + void MaterializedPostgreSQLConsumer::insertValue(StorageData::Buffer & buffer, const std::string & value, size_t column_idx) { + assertCorrectInsertion(buffer, column_idx); + const auto & sample = buffer.description.sample_block.getByPosition(column_idx); bool is_nullable = buffer.description.types[column_idx].second; @@ -134,6 +150,8 @@ void MaterializedPostgreSQLConsumer::insertValue(StorageData::Buffer & buffer, c void MaterializedPostgreSQLConsumer::insertDefaultValue(StorageData::Buffer & buffer, size_t column_idx) { + assertCorrectInsertion(buffer, column_idx); + const auto & sample = buffer.description.sample_block.getByPosition(column_idx); insertDefaultPostgreSQLValue(*buffer.columns[column_idx], *sample.column); } @@ -515,13 +533,14 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl void MaterializedPostgreSQLConsumer::syncTables() { - try + for (const auto & table_name : tables_to_sync) { - for (const auto & table_name : tables_to_sync) - { - auto & storage_data = storages.find(table_name)->second; - Block result_rows = storage_data.buffer.description.sample_block.cloneWithColumns(std::move(storage_data.buffer.columns)); + auto & storage_data = storages.find(table_name)->second; + Block result_rows = storage_data.buffer.description.sample_block.cloneWithColumns(std::move(storage_data.buffer.columns)); + storage_data.buffer.columns = storage_data.buffer.description.sample_block.cloneEmptyColumns(); + try + { if (result_rows.rows()) { auto storage = storage_data.storage; @@ -543,13 +562,18 @@ void MaterializedPostgreSQLConsumer::syncTables() CompletedPipelineExecutor executor(io.pipeline); executor.execute(); - - storage_data.buffer.columns = storage_data.buffer.description.sample_block.cloneEmptyColumns(); } } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } - LOG_DEBUG(log, "Table sync end for {} tables, last lsn: {} = {}, (attempted lsn {})", tables_to_sync.size(), current_lsn, getLSNValue(current_lsn), getLSNValue(final_lsn)); + LOG_DEBUG(log, "Table sync end for {} tables, last lsn: {} = {}, (attempted lsn {})", tables_to_sync.size(), current_lsn, getLSNValue(current_lsn), getLSNValue(final_lsn)); + try + { auto tx = std::make_shared(connection->getRef()); current_lsn = advanceLSN(tx); tables_to_sync.clear(); diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h index a01f9394190..5193feee708 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h @@ -122,6 +122,8 @@ private: void markTableAsSkipped(Int32 relation_id, const String & relation_name); + static void assertCorrectInsertion(StorageData::Buffer & buffer, size_t column_idx); + /// lsn - log sequnce nuumber, like wal offset (64 bit). static Int64 getLSNValue(const std::string & lsn) { diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index 876ba9b1698..fa90295bcd6 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -64,8 +64,8 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( bool is_attach_, const MaterializedPostgreSQLSettings & replication_settings, bool is_materialized_postgresql_database_) - : log(&Poco::Logger::get("PostgreSQLReplicationHandler")) - , context(context_) + : WithContext(context_->getGlobalContext()) + , log(&Poco::Logger::get("PostgreSQLReplicationHandler")) , is_attach(is_attach_) , postgres_database(postgres_database_) , postgres_schema(replication_settings.materialized_postgresql_schema) @@ -94,9 +94,9 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( } publication_name = fmt::format("{}_ch_publication", replication_identifier); - startup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ checkConnectionAndStart(); }); - consumer_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ consumerFunc(); }); - cleanup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ cleanupFunc(); }); + startup_task = getContext()->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ checkConnectionAndStart(); }); + consumer_task = getContext()->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ consumerFunc(); }); + cleanup_task = getContext()->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ cleanupFunc(); }); } @@ -296,7 +296,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) /// (Apart from the case, when shutdownFinal is called). /// Handler uses it only for loadFromSnapshot and shutdown methods. 
consumer = std::make_shared( - context, + getContext(), std::move(tmp_connection), replication_slot, publication_name, @@ -921,9 +921,9 @@ void PostgreSQLReplicationHandler::reloadFromSnapshot(const std::vectoras (); - auto materialized_table_lock = materialized_storage->lockForShare(String(), context->getSettingsRef().lock_acquire_timeout); + auto materialized_table_lock = materialized_storage->lockForShare(String(), getContext()->getSettingsRef().lock_acquire_timeout); /// If for some reason this temporary table already exists - also drop it. auto temp_materialized_storage = materialized_storage->createTemporary(); diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index 263095ec9c2..16e531f5247 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -13,7 +13,7 @@ namespace DB class StorageMaterializedPostgreSQL; struct SettingChange; -class PostgreSQLReplicationHandler +class PostgreSQLReplicationHandler : WithContext { friend class TemporaryReplicationSlot; @@ -98,7 +98,6 @@ private: std::pair getSchemaAndTableName(const String & table_name) const; Poco::Logger * log; - ContextPtr context; /// If it is not attach, i.e. a create query, then if publication already exists - always drop it. bool is_attach; From 98ad78f0a9b737123ca1a1bbcfd3f6513492538a Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 28 Mar 2022 09:41:56 +0200 Subject: [PATCH 088/111] Ping CI --- .../1_stateful/02226_s3_with_cache.reference | 2 + .../1_stateful/02226_s3_with_cache.sql | 42 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/queries/1_stateful/02226_s3_with_cache.reference create mode 100644 tests/queries/1_stateful/02226_s3_with_cache.sql diff --git a/tests/queries/1_stateful/02226_s3_with_cache.reference b/tests/queries/1_stateful/02226_s3_with_cache.reference new file mode 100644 index 00000000000..c5149d11cab --- /dev/null +++ b/tests/queries/1_stateful/02226_s3_with_cache.reference @@ -0,0 +1,2 @@ +SELECT 1, * FROM test LIMIT 10 FORMAT Null; 2097152 2097152 0 +SELECT 2, * FROM test LIMIT 10 FORMAT Null; 0 2097152 0 diff --git a/tests/queries/1_stateful/02226_s3_with_cache.sql b/tests/queries/1_stateful/02226_s3_with_cache.sql new file mode 100644 index 00000000000..e30088f8dba --- /dev/null +++ b/tests/queries/1_stateful/02226_s3_with_cache.sql @@ -0,0 +1,42 @@ +# Tags: no-parallel, no-fasttest + +-- CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache'; +-- INSERT INTO test SELECT * FROM generateRandom('key UInt32, value String') LIMIT 100000000; + +SET max_memory_usage='15G'; + +SELECT 1, * FROM test LIMIT 10 FORMAT Null; + +SYSTEM FLUSH LOGS; +SELECT query, + ProfileEvents['RemoteFSReadBytes'] > 0 as remote_fs_read, + ProfileEvents['RemoteFSCacheReadBytes'] > 0 as remote_fs_cache_read, + ProfileEvents['RemoteFSCacheDownloadedBytes'] > 0 as remote_fs_read_and_download +FROM system.query_log +WHERE query LIKE 'SELECT 1, * FROM test LIMIT%' +AND type = 'QueryFinish' +AND current_database = currentDatabase() +ORDER BY query_start_time DESC +LIMIT 1; + +SELECT * FROM test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10 FORMAT Null; +SELECT * FROM test ORDER BY value LIMIT 10 FORMAT Null; +SELECT * FROM test WHERE value LIKE '%dba%' ORDER BY value LIMIT 10 FORMAT Null; + +SET remote_filesystem_read_method='read'; + +SELECT 2, * FROM test LIMIT 10 FORMAT Null; + +SYSTEM 
FLUSH LOGS; +SELECT query, + ProfileEvents['RemoteFSReadBytes'] > 0 as remote_fs_read, + ProfileEvents['RemoteFSCacheReadBytes'] > 0 as remote_fs_cache_read, + ProfileEvents['RemoteFSCacheDownloadedBytes'] > 0 as remote_fs_read_and_download +FROM system.query_log +WHERE query LIKE 'SELECT 2, * FROM test LIMIT%' +AND type = 'QueryFinish' +AND current_database = currentDatabase() +ORDER BY query_start_time DESC +LIMIT 1; + +-- DROP TABLE test NO DELAY; From 08db628cc0269501d222d9c89bfa9b96213ca261 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 11:18:11 +0200 Subject: [PATCH 089/111] Update packages/clickhouse-keeper.yaml Co-authored-by: Mikhail f. Shiryaev --- packages/clickhouse-keeper.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/clickhouse-keeper.yaml b/packages/clickhouse-keeper.yaml index e717ba79c5b..e99ac30f944 100644 --- a/packages/clickhouse-keeper.yaml +++ b/packages/clickhouse-keeper.yaml @@ -14,6 +14,8 @@ conflicts: - clickhouse-server depends: - adduser +suggests: +- clickhouse-keeper-dbg maintainer: "ClickHouse Dev Team " description: | From 37e1160c678bfacf220880aaf560b71a49d4d7fa Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 11:18:16 +0200 Subject: [PATCH 090/111] Update packages/clickhouse-keeper-dbg.yaml Co-authored-by: Mikhail f. Shiryaev --- packages/clickhouse-keeper-dbg.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/clickhouse-keeper-dbg.yaml b/packages/clickhouse-keeper-dbg.yaml index 685bff11080..2c70b7ad4aa 100644 --- a/packages/clickhouse-keeper-dbg.yaml +++ b/packages/clickhouse-keeper-dbg.yaml @@ -9,10 +9,6 @@ homepage: "https://clickhouse.com" license: "Apache" section: "database" priority: "optional" - -conflicts: -- clickhouse-server - maintainer: "ClickHouse Dev Team " description: | debugging symbols for clickhouse-keeper From 77e700b1cfa293ad95f06e3ab4a3e2da040a616c Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 11:25:08 +0200 Subject: [PATCH 091/111] Update src/Compression/CompressionFactory.cpp --- src/Compression/CompressionFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index 8dfc894e15b..abf5e38a8c3 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -168,7 +168,7 @@ void registerCodecZSTD(CompressionCodecFactory & factory); void registerCodecMultiple(CompressionCodecFactory & factory); -/// Keeper use only general-purpose codes, so we don't need these special codecs +/// Keeper use only general-purpose codecs, so we don't need these special codecs /// in standalone build #ifndef KEEPER_STANDALONE_BUILD From 47517970cef7cc23dea58e5051d99196d27fe169 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 11:56:37 +0200 Subject: [PATCH 092/111] Bump CI From fe86805e5d5d9567f1a304a191c358e34cd5cd1b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 28 Mar 2022 12:14:56 +0200 Subject: [PATCH 093/111] fix flaky test --- ...mary_key_without_order_by_zookeeper.reference | 4 ++-- ...32_primary_key_without_order_by_zookeeper.sql | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.reference b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.reference index 66aaf09f4d9..0c1d3ae33ac 100644 --- a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.reference +++ 
b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.reference @@ -9,8 +9,8 @@ CREATE TABLE default.merge_tree_pk_sql\n(\n `key` UInt64,\n `value` String 1 c 2 b 1 c 0 -2 e 555 2 b 0 +2 e 555 CREATE TABLE default.merge_tree_pk_sql\n(\n `key` UInt64,\n `value` String,\n `key2` UInt64\n)\nENGINE = ReplacingMergeTree\nPRIMARY KEY key\nORDER BY (key, key2)\nSETTINGS index_granularity = 8192 CREATE TABLE default.replicated_merge_tree_pk_sql\n(\n `key` UInt64,\n `value` String\n)\nENGINE = ReplicatedReplacingMergeTree(\'/clickhouse/test/01532_primary_key_without\', \'r1\')\nPRIMARY KEY key\nORDER BY key\nSETTINGS index_granularity = 8192 1 a @@ -18,6 +18,6 @@ CREATE TABLE default.replicated_merge_tree_pk_sql\n(\n `key` UInt64,\n `va 1 c 2 b 1 c 0 -2 e 555 2 b 0 +2 e 555 CREATE TABLE default.replicated_merge_tree_pk_sql\n(\n `key` UInt64,\n `value` String,\n `key2` UInt64\n)\nENGINE = ReplicatedReplacingMergeTree(\'/clickhouse/test/01532_primary_key_without\', \'r1\')\nPRIMARY KEY key\nORDER BY (key, key2)\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql index 8d413cf6c35..d744e56c482 100644 --- a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql +++ b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql @@ -15,14 +15,14 @@ SHOW CREATE TABLE merge_tree_pk; INSERT INTO merge_tree_pk VALUES (1, 'a'); INSERT INTO merge_tree_pk VALUES (2, 'b'); -SELECT * FROM merge_tree_pk ORDER BY key; +SELECT * FROM merge_tree_pk ORDER BY key, value; INSERT INTO merge_tree_pk VALUES (1, 'c'); DETACH TABLE merge_tree_pk; ATTACH TABLE merge_tree_pk; -SELECT * FROM merge_tree_pk FINAL ORDER BY key; +SELECT * FROM merge_tree_pk FINAL ORDER BY key, value; DROP TABLE IF EXISTS merge_tree_pk; @@ -41,14 +41,14 @@ SHOW CREATE TABLE merge_tree_pk_sql; INSERT INTO merge_tree_pk_sql VALUES (1, 'a'); INSERT INTO merge_tree_pk_sql VALUES (2, 'b'); -SELECT * FROM merge_tree_pk_sql ORDER BY key; +SELECT * FROM merge_tree_pk_sql ORDER BY key, value; INSERT INTO merge_tree_pk_sql VALUES (1, 'c'); DETACH TABLE merge_tree_pk_sql; ATTACH TABLE merge_tree_pk_sql; -SELECT * FROM merge_tree_pk_sql FINAL ORDER BY key; +SELECT * FROM merge_tree_pk_sql FINAL ORDER BY key, value; ALTER TABLE merge_tree_pk_sql ADD COLUMN key2 UInt64, MODIFY ORDER BY (key, key2); @@ -56,7 +56,7 @@ INSERT INTO merge_tree_pk_sql VALUES (2, 'd', 555); INSERT INTO merge_tree_pk_sql VALUES (2, 'e', 555); -SELECT * FROM merge_tree_pk_sql FINAL ORDER BY key; +SELECT * FROM merge_tree_pk_sql FINAL ORDER BY key, value; SHOW CREATE TABLE merge_tree_pk_sql; @@ -77,14 +77,14 @@ SHOW CREATE TABLE replicated_merge_tree_pk_sql; INSERT INTO replicated_merge_tree_pk_sql VALUES (1, 'a'); INSERT INTO replicated_merge_tree_pk_sql VALUES (2, 'b'); -SELECT * FROM replicated_merge_tree_pk_sql ORDER BY key; +SELECT * FROM replicated_merge_tree_pk_sql ORDER BY key, value; INSERT INTO replicated_merge_tree_pk_sql VALUES (1, 'c'); DETACH TABLE replicated_merge_tree_pk_sql; ATTACH TABLE replicated_merge_tree_pk_sql; -SELECT * FROM replicated_merge_tree_pk_sql FINAL ORDER BY key; +SELECT * FROM replicated_merge_tree_pk_sql FINAL ORDER BY key, value; ALTER TABLE replicated_merge_tree_pk_sql ADD COLUMN key2 UInt64, MODIFY ORDER BY (key, key2); @@ -92,7 +92,7 @@ INSERT INTO replicated_merge_tree_pk_sql VALUES (2, 'd', 555); INSERT INTO replicated_merge_tree_pk_sql VALUES (2, 
'e', 555); -SELECT * FROM replicated_merge_tree_pk_sql FINAL ORDER BY key; +SELECT * FROM replicated_merge_tree_pk_sql FINAL ORDER BY key, value; DETACH TABLE replicated_merge_tree_pk_sql; ATTACH TABLE replicated_merge_tree_pk_sql; From c7fc1bd4dd386bcc89c9b1077e49b6f5e9f25f2f Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 12:16:41 +0200 Subject: [PATCH 094/111] Fix flaky 01307_multiple_leaders --- tests/queries/0_stateless/01307_multiple_leaders_zookeeper.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01307_multiple_leaders_zookeeper.sh b/tests/queries/0_stateless/01307_multiple_leaders_zookeeper.sh index 2a41d90cd3a..db986e74b6b 100755 --- a/tests/queries/0_stateless/01307_multiple_leaders_zookeeper.sh +++ b/tests/queries/0_stateless/01307_multiple_leaders_zookeeper.sh @@ -20,7 +20,8 @@ function thread() REPLICA=$1 ITERATIONS=$2 - $CLICKHOUSE_CLIENT --max_block_size 1 --min_insert_block_size_rows 0 --min_insert_block_size_bytes 0 --query "INSERT INTO r$REPLICA SELECT number * $NUM_REPLICAS + $REPLICA FROM numbers($ITERATIONS)" + # It's legal to fetch something before insert finished + $CLICKHOUSE_CLIENT --max_block_size 1 --min_insert_block_size_rows 0 --min_insert_block_size_bytes 0 --query "INSERT INTO r$REPLICA SELECT number * $NUM_REPLICAS + $REPLICA FROM numbers($ITERATIONS)" 2>&1 | grep -v -F "Tried to commit obsolete part" } for REPLICA in $SEQ; do From dbeba322fdb77bf81e7e85bbf3e42ff50348d6d2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 28 Mar 2022 12:51:26 +0200 Subject: [PATCH 095/111] Delete 02226_s3_with_cache.sql --- .../1_stateful/02226_s3_with_cache.sql | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 tests/queries/1_stateful/02226_s3_with_cache.sql diff --git a/tests/queries/1_stateful/02226_s3_with_cache.sql b/tests/queries/1_stateful/02226_s3_with_cache.sql deleted file mode 100644 index e30088f8dba..00000000000 --- a/tests/queries/1_stateful/02226_s3_with_cache.sql +++ /dev/null @@ -1,42 +0,0 @@ -# Tags: no-parallel, no-fasttest - --- CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache'; --- INSERT INTO test SELECT * FROM generateRandom('key UInt32, value String') LIMIT 100000000; - -SET max_memory_usage='15G'; - -SELECT 1, * FROM test LIMIT 10 FORMAT Null; - -SYSTEM FLUSH LOGS; -SELECT query, - ProfileEvents['RemoteFSReadBytes'] > 0 as remote_fs_read, - ProfileEvents['RemoteFSCacheReadBytes'] > 0 as remote_fs_cache_read, - ProfileEvents['RemoteFSCacheDownloadedBytes'] > 0 as remote_fs_read_and_download -FROM system.query_log -WHERE query LIKE 'SELECT 1, * FROM test LIMIT%' -AND type = 'QueryFinish' -AND current_database = currentDatabase() -ORDER BY query_start_time DESC -LIMIT 1; - -SELECT * FROM test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10 FORMAT Null; -SELECT * FROM test ORDER BY value LIMIT 10 FORMAT Null; -SELECT * FROM test WHERE value LIKE '%dba%' ORDER BY value LIMIT 10 FORMAT Null; - -SET remote_filesystem_read_method='read'; - -SELECT 2, * FROM test LIMIT 10 FORMAT Null; - -SYSTEM FLUSH LOGS; -SELECT query, - ProfileEvents['RemoteFSReadBytes'] > 0 as remote_fs_read, - ProfileEvents['RemoteFSCacheReadBytes'] > 0 as remote_fs_cache_read, - ProfileEvents['RemoteFSCacheDownloadedBytes'] > 0 as remote_fs_read_and_download -FROM system.query_log -WHERE query LIKE 'SELECT 2, * FROM test LIMIT%' -AND type = 'QueryFinish' -AND current_database = 
currentDatabase() -ORDER BY query_start_time DESC -LIMIT 1; - --- DROP TABLE test NO DELAY;
From 3887fefb6b3ec6576f5dd2bcb9252836c43478ab Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 28 Mar 2022 12:51:35 +0200 Subject: [PATCH 096/111] Delete 02226_s3_with_cache.reference --- tests/queries/1_stateful/02226_s3_with_cache.reference | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/queries/1_stateful/02226_s3_with_cache.reference diff --git a/tests/queries/1_stateful/02226_s3_with_cache.reference b/tests/queries/1_stateful/02226_s3_with_cache.reference deleted file mode 100644 index c5149d11cab..00000000000 --- a/tests/queries/1_stateful/02226_s3_with_cache.reference +++ /dev/null @@ -1,2 +0,0 @@ -SELECT 1, * FROM test LIMIT 10 FORMAT Null; 2097152 2097152 0 -SELECT 2, * FROM test LIMIT 10 FORMAT Null; 0 2097152 0
From 292a2594185fe77a3d253b57f045296f1f7c5b6a Mon Sep 17 00:00:00 2001 From: tavplubix Date: Mon, 28 Mar 2022 13:51:36 +0300 Subject: [PATCH 097/111] Update 01091_num_threads.sql --- tests/queries/0_stateless/01091_num_threads.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01091_num_threads.sql b/tests/queries/0_stateless/01091_num_threads.sql index e32d663880f..faeceb0e6d6 100644 --- a/tests/queries/0_stateless/01091_num_threads.sql +++ b/tests/queries/0_stateless/01091_num_threads.sql @@ -1,5 +1,6 @@ set log_queries=1; set log_query_threads=1; +set max_threads=0; WITH 01091 AS id SELECT 1; SYSTEM FLUSH LOGS;
From 1122db89dbdc835d964841587aabf5fd1c4369d6 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Mar 2022 11:26:42 +0000 Subject: [PATCH 098/111] Use float division for avg after optimize_fuse_sum_count_avg --- src/Interpreters/TreeRewriter.cpp | 5 ++++- tests/queries/0_stateless/02244_issue_35598_fuse.reference | 2 ++ tests/queries/0_stateless/02244_issue_35598_fuse.sql | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02244_issue_35598_fuse.reference create mode 100644 tests/queries/0_stateless/02244_issue_35598_fuse.sql diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 78e7ed33f8f..929e516f687 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -345,7 +345,10 @@ void replaceWithSumCount(String column_name, ASTFunction & func) { /// Rewrite "avg" to sumCount().1 / sumCount().2 auto new_arg1 = makeASTFunction("tupleElement", func_base, std::make_shared<ASTLiteral>(UInt8(1))); - auto new_arg2 = makeASTFunction("tupleElement", func_base, std::make_shared<ASTLiteral>(UInt8(2))); + auto new_arg2 = makeASTFunction("CAST", + makeASTFunction("tupleElement", func_base, std::make_shared<ASTLiteral>(UInt8(2))), + std::make_shared<ASTLiteral>("Float64")); + func.name = "divide"; exp_list->children.push_back(new_arg1); exp_list->children.push_back(new_arg2); diff --git a/tests/queries/0_stateless/02244_issue_35598_fuse.reference b/tests/queries/0_stateless/02244_issue_35598_fuse.reference new file mode 100644 index 00000000000..6ce84b402a3 --- /dev/null +++ b/tests/queries/0_stateless/02244_issue_35598_fuse.reference @@ -0,0 +1,2 @@ +0 0 nan +0 0 nan diff --git a/tests/queries/0_stateless/02244_issue_35598_fuse.sql b/tests/queries/0_stateless/02244_issue_35598_fuse.sql new file mode 100644 index 00000000000..a590854eb6c --- /dev/null +++ b/tests/queries/0_stateless/02244_issue_35598_fuse.sql @@ -0,0 +1,5 @@ +SELECT sum(x), count(x), avg(x) FROM (SELECT number :: Decimal32(0) AS x FROM numbers(0))
+SETTINGS optimize_syntax_fuse_functions = 0, optimize_fuse_sum_count_avg = 0; + +SELECT sum(x), count(x), avg(x) FROM (SELECT number :: Decimal32(0) AS x FROM numbers(0)) +SETTINGS optimize_syntax_fuse_functions = 1, optimize_fuse_sum_count_avg = 1;
From 31fa28a4c2cf189b84926e3259f69adf394de1b6 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 28 Mar 2022 13:52:35 +0200 Subject: [PATCH 099/111] Improve black check: show diff in the output --- utils/check-style/check-black | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/check-black b/utils/check-style/check-black index 1f0be9375c2..45e7820469b 100755 --- a/utils/check-style/check-black +++ b/utils/check-style/check-black @@ -6,7 +6,7 @@ set -e GIT_ROOT=$(git rev-parse --show-cdup) GIT_ROOT=${GIT_ROOT:-.} tmp=$(mktemp) -if ! find "$GIT_ROOT" -name '*.py' -not -path "$GIT_ROOT/contrib/*" -exec black --check {} + 1>"$tmp" 2>&1; then +if ! find "$GIT_ROOT" -name '*.py' -not -path "$GIT_ROOT/contrib/*" -exec black --check --diff {} + 1>"$tmp" 2>&1; then # Show the result only if some files need formatting cat "$tmp" fi
From 13831120da7a266444d3405e714c5a75876ac784 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 14:32:53 +0200 Subject: [PATCH 100/111] Fix restart replicas test --- ...estart_replicas_rename_deadlock_zookeeper.sh | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh b/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh index abd5c0d6a4f..7b065e87f8a 100755 --- a/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh +++ b/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh @@ -38,27 +38,18 @@ function restart_replicas_loop() done sleep 0.$RANDOM } -function restart_thread_1() -{ - restart_replicas_loop -} - -function restart_thread_2() -{ - restart_replicas_loop -} export -f rename_thread_1 export -f rename_thread_2 -export -f restart_thread_1 -export -f restart_thread_2 +export -f restart_replicas_loop +export -f restart_replicas_loop TIMEOUT=10 clickhouse_client_loop_timeout $TIMEOUT rename_thread_1 2> /dev/null & clickhouse_client_loop_timeout $TIMEOUT rename_thread_2 2> /dev/null & -clickhouse_client_loop_timeout $TIMEOUT restart_thread_1 2> /dev/null & -clickhouse_client_loop_timeout $TIMEOUT restart_thread_2 2> /dev/null & +clickhouse_client_loop_timeout $TIMEOUT restart_replicas_loop 2> /dev/null & +clickhouse_client_loop_timeout $TIMEOUT restart_replicas_loop 2> /dev/null & wait
From 8acb5de5b5ef3512ad1e0bc1fd2ccd663380c5d9 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 14:43:15 +0200 Subject: [PATCH 101/111] Fix build report black --- tests/ci/build_report_check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 5946c00cc8a..592e905bcb5 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -151,7 +151,11 @@ if __name__ == "__main__": some_builds_are_missing = len(build_reports_map) < len(reports_order) if some_builds_are_missing: - logging.info("Expected to get %s build results, got %s", len(reports_order), len(build_reports_map)) + logging.info( + "Expected to get %s build results, got %s", + len(reports_order), + len(build_reports_map), + ) else: logging.info("Got exactly %s builds", len(build_reports_map)) From
e795b83148054935b8b27dacb69ba8209c852c86 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 28 Mar 2022 14:56:23 +0200 Subject: [PATCH 102/111] disable random settings in fast test --- docker/test/fasttest/run.sh | 1 + tests/clickhouse-test | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index bd1e0292636..079d2872204 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -267,6 +267,7 @@ function run_tests local test_opts=( --hung-check --fast-tests-only + --no-random-settings --no-long --testname --shard diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 2c830e6ea40..82d4a0e6ade 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -468,9 +468,11 @@ class TestCase: return testcase_args - def add_random_settings(self, client_options): + def add_random_settings(self, args, client_options): if self.tags and 'no-random-settings' in self.tags: return client_options + if args.no_random_settings: + return client_options if len(self.base_url_params) == 0: os.environ['CLICKHOUSE_URL_PARAMS'] = '&'.join(self.random_settings) @@ -485,9 +487,11 @@ class TestCase: os.environ['CLICKHOUSE_URL_PARAMS'] = self.base_url_params os.environ['CLICKHOUSE_CLIENT_OPT'] = self.base_client_options - def add_info_about_settings(self, description): + def add_info_about_settings(self, args, description): if self.tags and 'no-random-settings' in self.tags: return description + if args.no_random_settings: + return description return description + "\n" + "Settings used in the test: " + "--" + " --".join(self.random_settings) + "\n" @@ -788,13 +792,13 @@ class TestCase: self.runs_count += 1 self.testcase_args = self.configure_testcase_args(args, self.case_file, suite.suite_tmp_path) - client_options = self.add_random_settings(client_options) + client_options = self.add_random_settings(args, client_options) proc, stdout, stderr, total_time = self.run_single_test(server_logs_level, client_options) result = self.process_result_impl(proc, stdout, stderr, total_time) result.check_if_need_retry(args, stdout, stderr, self.runs_count) if result.status == TestStatus.FAIL: - result.description = self.add_info_about_settings(result.description) + result.description = self.add_info_about_settings(args, result.description) return result except KeyboardInterrupt as e: raise e @@ -802,12 +806,12 @@ class TestCase: return TestResult(self.name, TestStatus.FAIL, FailureReason.INTERNAL_QUERY_FAIL, 0., - self.add_info_about_settings(self.get_description_from_exception_info(sys.exc_info()))) + self.add_info_about_settings(args, self.get_description_from_exception_info(sys.exc_info()))) except (ConnectionRefusedError, ConnectionResetError): return TestResult(self.name, TestStatus.FAIL, FailureReason.SERVER_DIED, 0., - self.add_info_about_settings(self.get_description_from_exception_info(sys.exc_info()))) + self.add_info_about_settings(args, self.get_description_from_exception_info(sys.exc_info()))) except: return TestResult(self.name, TestStatus.UNKNOWN, FailureReason.INTERNAL_ERROR, @@ -1501,6 +1505,7 @@ if __name__ == '__main__': parser.add_argument('--print-time', action='store_true', dest='print_time', help='Print test time') parser.add_argument('--check-zookeeper-session', action='store_true', help='Check ZooKeeper session uptime to determine if failed test should be retried') parser.add_argument('--s3-storage', action='store_true', default=False, help='Run tests over s3 storage') + 
parser.add_argument('--no-random-settings', action='store_true', default=False, help='Disable settings randomization') parser.add_argument('--run-by-hash-num', type=int, help='Run tests matching crc32(test_name) % run_by_hash_total == run_by_hash_num') parser.add_argument('--run-by-hash-total', type=int, help='Total test groups for crc32(test_name) % run_by_hash_total == run_by_hash_num') From 51ef88e5dd01f1e81082b0ce149504a104e09c6b Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 15:29:13 +0200 Subject: [PATCH 103/111] Fix 00385 --- .../00385_storage_file_and_clickhouse-local_app_long.sh | 4 ++-- tests/queries/0_stateless/01268_procfs_metrics.sh | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00385_storage_file_and_clickhouse-local_app_long.sh b/tests/queries/0_stateless/00385_storage_file_and_clickhouse-local_app_long.sh index d77955a51bc..a78cdd445cb 100755 --- a/tests/queries/0_stateless/00385_storage_file_and_clickhouse-local_app_long.sh +++ b/tests/queries/0_stateless/00385_storage_file_and_clickhouse-local_app_long.sh @@ -19,11 +19,11 @@ function pack_unpack_compare() ${CLICKHOUSE_CLIENT} --query "CREATE TABLE buf_00385 ENGINE = Memory AS $1" local res_orig - res_orig=$(${CLICKHOUSE_CLIENT} --max_threads=1 --query "SELECT $TABLE_HASH FROM buf_00385") + res_orig=$(${CLICKHOUSE_CLIENT} --max_block_size=65505 --max_threads=1 --query "SELECT $TABLE_HASH FROM buf_00385") ${CLICKHOUSE_CLIENT} --max_threads=1 --query "CREATE TABLE buf_file ENGINE = File($3) AS SELECT * FROM buf_00385" local res_db_file - res_db_file=$(${CLICKHOUSE_CLIENT} --max_threads=1 --query "SELECT $TABLE_HASH FROM buf_file") + res_db_file=$(${CLICKHOUSE_CLIENT} --max_block_size=65505 --max_threads=1 --query "SELECT $TABLE_HASH FROM buf_file") ${CLICKHOUSE_CLIENT} --max_threads=1 --query "SELECT * FROM buf_00385 FORMAT $3" > "$buf_file" local res_ch_local1 diff --git a/tests/queries/0_stateless/01268_procfs_metrics.sh b/tests/queries/0_stateless/01268_procfs_metrics.sh index d5bd99724ca..acf282ffd67 100755 --- a/tests/queries/0_stateless/01268_procfs_metrics.sh +++ b/tests/queries/0_stateless/01268_procfs_metrics.sh @@ -41,4 +41,7 @@ timeout $TIMEOUT bash -c show_processes_func & wait +# otherwise it can be alive after test +$CLICKHOUSE_CLIENT -q "KILL QUERY WHERE query='SELECT * FROM numbers(600000000) FORMAT Null SETTINGS max_threads = 1'" 2> /dev/null 1> /dev/null + echo "Test OK" From f10930c9c863b7b7554d749377fc41c47e7bf46c Mon Sep 17 00:00:00 2001 From: tavplubix Date: Mon, 28 Mar 2022 16:33:01 +0300 Subject: [PATCH 104/111] Update clickhouse-test --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 2c830e6ea40..6720269fb21 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -374,7 +374,7 @@ class SettingsRandomizer: "output_format_parallel_formatting": lambda: random.randint(0, 1), "input_format_parallel_parsing": lambda: random.randint(0, 1), "min_chunk_bytes_for_parallel_parsing": lambda: max(1024, int(random.gauss(10 * 1024 * 1024, 5 * 1000 * 1000))), - "max_read_buffer_size": lambda: random.randint(1, 20) if random.random() < 0.1 else random.randint(500000, 1048576), + "max_read_buffer_size": lambda: random.randint(500000, 1048576), "prefer_localhost_replica": lambda: random.randint(0, 1), "max_block_size": lambda: random.randint(8000, 100000), "max_threads": lambda: random.randint(1, 64), From 9f2cba498cf778aa657c1dc09b02e73162c7108a Mon Sep 
17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:34:32 +0200 Subject: [PATCH 105/111] Fix timeouts in 01514_parallel_formatting test --- tests/queries/0_stateless/01514_parallel_formatting.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/01514_parallel_formatting.sql b/tests/queries/0_stateless/01514_parallel_formatting.sql index 95a9e19aa1f..a2d50a4d7bb 100644 --- a/tests/queries/0_stateless/01514_parallel_formatting.sql +++ b/tests/queries/0_stateless/01514_parallel_formatting.sql @@ -1,5 +1,8 @@ drop table if exists tsv; set output_format_parallel_formatting=1; +set max_read_buffer_size=1048576; +set max_block_size=65505; + create table tsv(a int, b int default 7) engine File(TSV); insert into tsv(a) select number from numbers(10000000); From e75a0a594d4516c23f5348c1e99b93a94f4a61fd Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 15:39:02 +0200 Subject: [PATCH 106/111] Better fix for procfs_metrics --- tests/queries/0_stateless/01268_procfs_metrics.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01268_procfs_metrics.sh b/tests/queries/0_stateless/01268_procfs_metrics.sh index acf282ffd67..c1697edd632 100755 --- a/tests/queries/0_stateless/01268_procfs_metrics.sh +++ b/tests/queries/0_stateless/01268_procfs_metrics.sh @@ -42,6 +42,12 @@ timeout $TIMEOUT bash -c show_processes_func & wait # otherwise it can be alive after test -$CLICKHOUSE_CLIENT -q "KILL QUERY WHERE query='SELECT * FROM numbers(600000000) FORMAT Null SETTINGS max_threads = 1'" 2> /dev/null 1> /dev/null +query_alive=$($CLICKHOUSE_CLIENT --query "SELECT count() FROM system.processes WHERE query ILIKE 'SELECT * FROM numbers(600000000)%'") +while [[ $query_alive != 0 ]] +do + $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE query ilike '%SELECT * FROM numbers(600000000)%'" 2> /dev/null 1> /dev/null + sleep 0.5 + query_alive=$($CLICKHOUSE_CLIENT --query "SELECT count() FROM system.processes WHERE query ILIKE 'SELECT * FROM numbers(600000000)%'") +done echo "Test OK" From b838a7dcb0300877c121654ceb6d6b2c8f53c567 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 15:53:22 +0200 Subject: [PATCH 107/111] Remove outdated links from CI --- CMakeLists.txt | 2 +- base/glibc-compatibility/CMakeLists.txt | 2 +- cmake/version.cmake | 2 +- docker/packager/packager | 2 +- docker/test/fuzzer/run-fuzzer.sh | 2 +- docker/test/keeper-jepsen/run.sh | 4 ++-- docker/test/performance-comparison/compare.sh | 6 +++--- .../test/performance-comparison/download.sh | 19 +++++-------------- .../test/performance-comparison/entrypoint.sh | 6 ++---- docker/test/stateful/Dockerfile | 2 +- docker/test/stateful/s3downloader | 2 +- docker/test/stress/Dockerfile | 2 +- 12 files changed, 20 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index deef582c790..5157f0f9903 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,7 +261,7 @@ endif () # Add a section with the hash of the compiled machine code for integrity checks. # Only for official builds, because adding a section can be time consuming (rewrite of several GB). 
# And cross compiled binaries are not supported (since you cannot execute clickhouse hash-binary) -if (OBJCOPY_PATH AND YANDEX_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE)) +if (OBJCOPY_PATH AND CLICKHOUSE_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE)) set (USE_BINARY_HASH 1) endif () diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index ddec09121e1..ef7ec6d7fc0 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -51,6 +51,6 @@ if (GLIBC_COMPATIBILITY) message (STATUS "Some symbols from glibc will be replaced for compatibility") -elseif (YANDEX_OFFICIAL_BUILD) +elseif (CLICKHOUSE_OFFICIAL_BUILD) message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.") endif () diff --git a/cmake/version.cmake b/cmake/version.cmake index 963f291c0f3..acaa772ff2f 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -18,6 +18,6 @@ set (VERSION_STRING_SHORT "${VERSION_MAJOR}.${VERSION_MINOR}") math (EXPR VERSION_INTEGER "${VERSION_PATCH} + ${VERSION_MINOR}*1000 + ${VERSION_MAJOR}*1000000") -if(YANDEX_OFFICIAL_BUILD) +if(CLICKHOUSE_OFFICIAL_BUILD) set(VERSION_OFFICIAL " (official build)") endif() diff --git a/docker/packager/packager b/docker/packager/packager index a5763273f5f..c05e39b1d56 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -244,7 +244,7 @@ def parse_env_variables( result.append(f"AUTHOR='{author}'") if official: - cmake_flags.append("-DYANDEX_OFFICIAL_BUILD=1") + cmake_flags.append("-DCLICKHOUSE_OFFICIAL_BUILD=1") result.append('CMAKE_FLAGS="' + " ".join(cmake_flags) + '"') diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index e18c07bf2c1..74711f476f8 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -13,7 +13,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "$script_dir" repo_dir=ch BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-13_debug_none_bundled_unsplitted_disable_False_binary"} -BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"} +BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"} function clone { diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh index d7534270e2c..4dec82234bc 100644 --- a/docker/test/keeper-jepsen/run.sh +++ b/docker/test/keeper-jepsen/run.sh @@ -2,7 +2,7 @@ set -euo pipefail -CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-13_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-13_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"} CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} @@ -10,7 +10,7 @@ if [ -z "$CLICKHOUSE_REPO_PATH" ]; then CLICKHOUSE_REPO_PATH=ch rm -rf ch ||: mkdir ch ||: - wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz" + wget -nv -nd -c "https://clickhouse-test-reports.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz" tar -C ch --strip-components=1 -xf 
clickhouse_no_subs.tar.gz ls -lath ||: fi diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 16ac304d7fb..54f71ce05bb 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -1294,15 +1294,15 @@ create table ci_checks engine File(TSVWithNamesAndTypes, 'ci-checks.tsv') select '' test_name, '$(sed -n 's/.*/\1/p' report.html)' test_status, 0 test_duration_ms, - 'https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#fail1' report_url + 'https://clickhouse-test-reports.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#fail1' report_url union all select test || ' #' || toString(query_index), 'slower' test_status, 0 test_duration_ms, - 'https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#changes-in-performance.' + 'https://clickhouse-test-reports.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#changes-in-performance.' || test || '.' || toString(query_index) report_url from queries where changed_fail != 0 and diff > 0 union all select test || ' #' || toString(query_index), 'unstable' test_status, 0 test_duration_ms, - 'https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#unstable-queries.' + 'https://clickhouse-test-reports.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/performance_comparison/report.html#unstable-queries.' || test || '.' || toString(query_index) report_url from queries where unstable_fail != 0 ) diff --git a/docker/test/performance-comparison/download.sh b/docker/test/performance-comparison/download.sh index 8fa6eb5ec83..ae9e677713f 100755 --- a/docker/test/performance-comparison/download.sh +++ b/docker/test/performance-comparison/download.sh @@ -16,26 +16,17 @@ right_sha=$4 datasets=${CHPC_DATASETS-"hits1 hits10 hits100 values"} declare -A dataset_paths -if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then - dataset_paths["hits10"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_10m_single/partitions/hits_10m_single.tar" - dataset_paths["hits100"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_100m_single/partitions/hits_100m_single.tar" - dataset_paths["hits1"]="https://clickhouse-datasets.s3.amazonaws.com/hits/partitions/hits_v1.tar" - dataset_paths["values"]="https://clickhouse-datasets.s3.amazonaws.com/values_with_expressions/partitions/test_values.tar" -else - dataset_paths["hits10"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" - dataset_paths["hits100"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" - dataset_paths["hits1"]="https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" - dataset_paths["values"]="https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar" -fi +dataset_paths["hits10"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_10m_single/partitions/hits_10m_single.tar" +dataset_paths["hits100"]="https://clickhouse-private-datasets.s3.amazonaws.com/hits_100m_single/partitions/hits_100m_single.tar" +dataset_paths["hits1"]="https://clickhouse-datasets.s3.amazonaws.com/hits/partitions/hits_v1.tar" +dataset_paths["values"]="https://clickhouse-datasets.s3.amazonaws.com/values_with_expressions/partitions/test_values.tar" function download { # 
Historically there were various paths for the performance test package. # Test all of them. - declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/performance/performance.tgz" - "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/clickhouse_build_check/performance/performance.tgz" - ) + declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/performance/performance.tgz") for path in "${urls_to_try[@]}" do diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh index 3d37a6c0e92..767807d008b 100755 --- a/docker/test/performance-comparison/entrypoint.sh +++ b/docker/test/performance-comparison/entrypoint.sh @@ -4,7 +4,7 @@ set -ex CHPC_CHECK_START_TIMESTAMP="$(date +%s)" export CHPC_CHECK_START_TIMESTAMP -S3_URL=${S3_URL:="https://clickhouse-builds.s3.yandex.net"} +S3_URL=${S3_URL:="https://clickhouse-builds.s3.amazonaws.com"} COMMON_BUILD_PREFIX="/clickhouse_build_check" if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then @@ -64,9 +64,7 @@ function find_reference_sha # Historically there were various path for the performance test package, # test all of them. unset found - declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/performance/performance.tgz" - "https://clickhouse-builds.s3.yandex.net/0/$REF_SHA/clickhouse_build_check/performance/performance.tgz" - ) + declare -a urls_to_try=("https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/performance/performance.tgz") for path in "${urls_to_try[@]}" do if curl_with_retry "$path" diff --git a/docker/test/stateful/Dockerfile b/docker/test/stateful/Dockerfile index 93e7cebb857..543cf113b2b 100644 --- a/docker/test/stateful/Dockerfile +++ b/docker/test/stateful/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get update -y \ COPY s3downloader /s3downloader -ENV S3_URL="https://clickhouse-datasets.s3.yandex.net" +ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com" ENV DATASETS="hits visits" ENV EXPORT_S3_STORAGE_POLICIES=1 diff --git a/docker/test/stateful/s3downloader b/docker/test/stateful/s3downloader index eb3b3cd9faf..b1302877d6a 100755 --- a/docker/test/stateful/s3downloader +++ b/docker/test/stateful/s3downloader @@ -10,7 +10,7 @@ import requests import tempfile -DEFAULT_URL = 'https://clickhouse-datasets.s3.yandex.net' +DEFAULT_URL = 'https://clickhouse-datasets.s3.amazonaws.com' AVAILABLE_DATASETS = { 'hits': 'hits_v1.tar', diff --git a/docker/test/stress/Dockerfile b/docker/test/stress/Dockerfile index 495c12f4f83..ba6daffc014 100644 --- a/docker/test/stress/Dockerfile +++ b/docker/test/stress/Dockerfile @@ -29,7 +29,7 @@ COPY ./download_previous_release /download_previous_release COPY run.sh / ENV DATASETS="hits visits" -ENV S3_URL="https://clickhouse-datasets.s3.yandex.net" +ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com" ENV EXPORT_S3_STORAGE_POLICIES=1 CMD ["/bin/bash", "/run.sh"] From 1eef2d7c26f1ee7b8d9bf8f361b6f59b3ab6e660 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Mon, 28 Mar 2022 16:56:25 +0300 Subject: [PATCH 108/111] Update 01532_primary_key_without_order_by_zookeeper.sql --- .../01532_primary_key_without_order_by_zookeeper.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql index d744e56c482..109c808b344 100644 --- a/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql +++ 
b/tests/queries/0_stateless/01532_primary_key_without_order_by_zookeeper.sql @@ -1,4 +1,4 @@ --- Tags: zookeeper +-- Tags: zookeeper, no-parallel DROP TABLE IF EXISTS merge_tree_pk; From 1f028dd48e41509617c6a97a3e43d83d50bb7d02 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 28 Mar 2022 14:13:56 +0000 Subject: [PATCH 109/111] Fix flaky tests 02149_read_in_order_fixed_prefix and 02177_issue_31009 --- .../queries/0_stateless/02149_read_in_order_fixed_prefix.sql | 2 ++ tests/queries/0_stateless/02177_issue_31009.sql | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql index 7d0e9111d9c..8fb11ac383c 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql @@ -1,3 +1,5 @@ +SET max_threads=0; + DROP TABLE IF EXISTS t_read_in_order; CREATE TABLE t_read_in_order(date Date, i UInt64, v UInt64) diff --git a/tests/queries/0_stateless/02177_issue_31009.sql b/tests/queries/0_stateless/02177_issue_31009.sql index f4a65e3a3a0..ab4aec60ce4 100644 --- a/tests/queries/0_stateless/02177_issue_31009.sql +++ b/tests/queries/0_stateless/02177_issue_31009.sql @@ -1,5 +1,10 @@ -- Tags: long +SET max_threads=0; + +DROP TABLE IF EXISTS left; +DROP TABLE IF EXISTS right; + CREATE TABLE left ( key UInt32, value String ) ENGINE = MergeTree ORDER BY key; CREATE TABLE right ( key UInt32, value String ) ENGINE = MergeTree ORDER BY tuple(); From 9c3e9a2c9b30ef8739616d54bb4ba759fc0f2bf0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 28 Mar 2022 17:11:20 +0200 Subject: [PATCH 110/111] Update tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh Co-authored-by: Azat Khuzhin --- .../01108_restart_replicas_rename_deadlock_zookeeper.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh b/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh index 7b065e87f8a..a51e786b058 100755 --- a/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh +++ b/tests/queries/0_stateless/01108_restart_replicas_rename_deadlock_zookeeper.sh @@ -42,7 +42,6 @@ function restart_replicas_loop() export -f rename_thread_1 export -f rename_thread_2 export -f restart_replicas_loop -export -f restart_replicas_loop TIMEOUT=10 From ea7939103abc8ce25e0ed373de606a56f4252495 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 28 Mar 2022 18:43:45 +0200 Subject: [PATCH 111/111] Add debug information about event, rerun only failed jobs --- tests/ci/workflow_approve_rerun_lambda/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index 50b9d9bfedc..b650d1651fe 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -379,12 +379,16 @@ def check_need_to_rerun(workflow_description): def rerun_workflow(workflow_description, token): print("Going to rerun workflow") - _exec_post_with_retry(workflow_description.rerun_url, token) + try: + _exec_post_with_retry(f"{workflow_description.rerun_url}-failed-jobs", token) + except Exception: + _exec_post_with_retry(workflow_description.rerun_url, token) def main(event): token = get_token_from_aws() event_data = json.loads(event["body"]) + print("The body received:", event_data) workflow_description = get_workflow_description_from_event(event_data) print("Got workflow description", workflow_description)