From cb7be14492ed358416a35172ecc43741af605642 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 May 2022 20:16:42 +0300 Subject: [PATCH 01/69] FR: Expose what triggered the merge in system.part_log #26255 --- src/Interpreters/PartLog.cpp | 31 ++++++++++++++++ src/Interpreters/PartLog.h | 15 ++++++++ src/Storages/MergeTree/MergeTreeData.cpp | 4 ++ .../02293_part_log_has_merge_reason.reference | 1 + .../02293_part_log_has_merge_reason.sql | 37 +++++++++++++++++++ 5 files changed, 88 insertions(+) create mode 100644 tests/queries/0_stateless/02293_part_log_has_merge_reason.reference create mode 100644 tests/queries/0_stateless/02293_part_log_has_merge_reason.sql diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index ce9aa0c03d1..643fd192cad 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -16,6 +16,25 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) { + switch (merge_type) + { + case MergeType::REGULAR: + return REGULAR_MERGE; + case MergeType::TTL_DELETE: + return TTL_DELETE_MERGE; + case MergeType::TTL_RECOMPRESS: + return TTL_RECOMPRESS_MERGE; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown MergeType {}", static_cast(merge_type)); +} + NamesAndTypesList PartLogElement::getNamesAndTypes() { auto event_type_datatype = std::make_shared( @@ -30,11 +49,22 @@ NamesAndTypesList PartLogElement::getNamesAndTypes() } ); + auto merge_reason_datatype = std::make_shared( + DataTypeEnum8::Values + { + {"NotAMerge", static_cast(NOT_A_MERGE)}, + {"RegularMerge", static_cast(REGULAR_MERGE)}, + {"TTLDeleteMerge", static_cast(TTL_DELETE_MERGE)}, + {"TTLRecompressMerge", static_cast(TTL_RECOMPRESS_MERGE)}, + } + ); + ColumnsWithTypeAndName columns_with_type_and_name; return { {"query_id", std::make_shared()}, {"event_type", std::move(event_type_datatype)}, + {"merge_reason", std::move(merge_reason_datatype)}, {"event_date", std::make_shared()}, {"event_time", std::make_shared()}, @@ -71,6 +101,7 @@ void PartLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(query_id); columns[i++]->insert(event_type); + columns[i++]->insert(merge_reason); columns[i++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType()); columns[i++]->insert(event_time); columns[i++]->insert(event_time_microseconds); diff --git a/src/Interpreters/PartLog.h b/src/Interpreters/PartLog.h index 7582f6fe9e6..48a54c55b1c 100644 --- a/src/Interpreters/PartLog.h +++ b/src/Interpreters/PartLog.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -20,9 +21,22 @@ struct PartLogElement MOVE_PART = 6, }; + enum MergeReasonType + { + /// merge_reason is relevant only for event_type = 'MERGE_PARTS', in other cases it is NOT_A_MERGE + NOT_A_MERGE = 1, + /// Just regular merge + REGULAR_MERGE = 2, + /// Merge assigned to delete some data from parts (with TTLMergeSelector) + TTL_DELETE_MERGE = 3, + /// Merge with recompression + TTL_RECOMPRESS_MERGE = 4, + }; + String query_id; Type event_type = NEW_PART; + MergeReasonType merge_reason = NOT_A_MERGE; time_t event_time = 0; Decimal64 event_time_microseconds = 0; @@ -54,6 +68,7 @@ struct PartLogElement static std::string name() { return "PartLog"; } + static MergeReasonType getMergeReasonType(MergeType merge_type); static NamesAndTypesList getNamesAndTypes(); static NamesAndAliases getNamesAndAliases() { return {}; } void appendToBlock(MutableColumns & 
columns) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d84fb9d30d3..86787635b1b 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6104,6 +6104,10 @@ try part_log_elem.event_type = type; + if (part_log_elem.event_type == PartLogElement::MERGE_PARTS) + if (merge_entry) + part_log_elem.merge_reason = PartLogElement::getMergeReasonType((*merge_entry)->merge_type); + part_log_elem.error = static_cast(execution_status.code); part_log_elem.exception = execution_status.message; diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql new file mode 100644 index 00000000000..db1f4c26af4 --- /dev/null +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -0,0 +1,37 @@ +DROP TABLE IF EXISTS t_part_log_has_merge_type_table; + +CREATE TABLE t_part_log_has_merge_type_table +( + event_time DateTime, + UserID UInt64, + Comment String +) +ENGINE = MergeTree() +ORDER BY tuple() +TTL event_time + toIntervalMonth(3) +SETTINGS min_bytes_for_wide_part = 0, merge_with_ttl_timeout = 1; + +INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); +INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); + +OPTIMIZE TABLE t_part_log_has_merge_type_table FINAL; + +SYSTEM FLUSH LOGS; + +SELECT count(*) +FROM +( + SELECT + metadata_modification_time, + event_time + FROM system.tables AS l + INNER JOIN system.part_log AS r + ON l.name = r.table + WHERE (l.database = currentDatabase()) AND + (l.name = 't_part_log_has_merge_type_table') AND + (r.event_type = 'MergeParts') AND + (r.merge_reason = 'TTLDeleteMerge') +) +WHERE (metadata_modification_time <= event_time); + +DROP TABLE t_part_log_has_merge_type_table; From d6d249d964971bf17064a5fdee0d5b953e14a42a Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 May 2022 20:16:42 +0300 Subject: [PATCH 02/69] FR: Expose what triggered the merge in system.part_log #26255 --- docs/en/operations/system-tables/part_log.md | 6 +++++ src/Interpreters/PartLog.cpp | 9 ++++--- .../02293_part_log_has_merge_reason.reference | 2 +- .../02293_part_log_has_merge_reason.sql | 26 +++++++------------ 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md index 00eaca23862..1b567367c97 100644 --- a/docs/en/operations/system-tables/part_log.md +++ b/docs/en/operations/system-tables/part_log.md @@ -14,6 +14,11 @@ The `system.part_log` table contains the following columns: - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../../sql-reference/statements/alter/partition.md#alter_detach-partition). - `MUTATE_PART` — Mutating of a data part. - `MOVE_PART` — Moving the data part from the one disk to another one. +- `merge_reason` ([Enum8](../../sql-reference/data-types/enum.md)) — The reason for the event with type `MERGE_PARTS`. Can have one of the following values: + - `NOT_A_MERGE` — The current event has the type other than `MERGE_PARTS`. + - `REGULAR_MERGE` — Some regular merge. 
+ - `TTL_DELETE_MERGE` — Cleaning up expired data. + - `TTL_RECOMPRESS_MERGE` — Recompressing data part with the. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. - `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision. @@ -46,6 +51,7 @@ Row 1: ────── query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31 event_type: NewPart +merge_reason: NotAMerge event_date: 2021-02-02 event_time: 2021-02-02 11:14:28 event_time_microseconds: 2021-02-02 11:14:28.861919 diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 643fd192cad..4474c22d464 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -21,14 +21,15 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) { +PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) +{ switch (merge_type) { - case MergeType::REGULAR: + case MergeType::Regular: return REGULAR_MERGE; - case MergeType::TTL_DELETE: + case MergeType::TTLDelete: return TTL_DELETE_MERGE; - case MergeType::TTL_RECOMPRESS: + case MergeType::TTLRecompress: return TTL_RECOMPRESS_MERGE; } diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference index d00491fd7e5..220107cf15b 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference @@ -1 +1 @@ -1 +MergeParts TTLDeleteMerge diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql index db1f4c26af4..7ef86354e71 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -8,30 +8,22 @@ CREATE TABLE t_part_log_has_merge_type_table ) ENGINE = MergeTree() ORDER BY tuple() -TTL event_time + toIntervalMonth(3) -SETTINGS min_bytes_for_wide_part = 0, merge_with_ttl_timeout = 1; +SETTINGS min_bytes_for_wide_part = 0, materialize_ttl_recalculate_only = true; INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); +ALTER TABLE t_part_log_has_merge_type_table + MODIFY TTL event_time + INTERVAL 3 MONTH; + OPTIMIZE TABLE t_part_log_has_merge_type_table FINAL; SYSTEM FLUSH LOGS; -SELECT count(*) -FROM -( - SELECT - metadata_modification_time, - event_time - FROM system.tables AS l - INNER JOIN system.part_log AS r - ON l.name = r.table - WHERE (l.database = currentDatabase()) AND - (l.name = 't_part_log_has_merge_type_table') AND - (r.event_type = 'MergeParts') AND - (r.merge_reason = 'TTLDeleteMerge') -) -WHERE (metadata_modification_time <= event_time); +SELECT + event_type, + merge_reason +FROM system.part_log +WHERE (table = 't_part_log_has_merge_type_table') AND (merge_reason = 'TTLDeleteMerge'); DROP TABLE t_part_log_has_merge_type_table; From e9187ec4b7938616957bbad06c4816fcd94d7777 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 23 May 2022 14:35:09 +0000 Subject: [PATCH 03/69] Overcommit: update defaults, exception message and add ProfileEvent --- programs/server/Server.cpp | 2 +- src/Common/MemoryTracker.cpp | 36 
++++++++++++++++--- src/Common/OvercommitTracker.cpp | 24 ++++++++++--- src/Common/OvercommitTracker.h | 12 ++++++- src/Common/ProfileEvents.cpp | 1 + src/Common/tests/gtest_overcommit_tracker.cpp | 20 +++++------ 6 files changed, 73 insertions(+), 22 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index defc66b0ed9..18ab96983eb 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1095,7 +1095,7 @@ int Server::main(const std::vector & /*args*/) total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); auto * global_overcommit_tracker = global_context->getGlobalOvercommitTracker(); - UInt64 max_overcommit_wait_time = config->getUInt64("global_memory_usage_overcommit_max_wait_microseconds", 200); + UInt64 max_overcommit_wait_time = config->getUInt64("global_memory_usage_overcommit_max_wait_microseconds", 5'000'000); global_overcommit_tracker->setMaxWaitTime(max_overcommit_wait_time); total_memory_tracker.setOvercommitTracker(global_overcommit_tracker); diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index 0e7803aaa71..b5a27543b4e 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef MEMORY_TRACKER_DEBUG_CHECKS @@ -52,6 +53,30 @@ namespace DB } } +namespace +{ + +inline std::string_view toDescription(OvercommitResult result) +{ + switch (result) + { + case OvercommitResult::NONE: + return "Memory overcommit isn't used. OvercommitTracker isn't set."; + case OvercommitResult::DISABLED: + return "Memory overcommit isn't used. Waiting time or orvercommit denominator are set to zero."; + case OvercommitResult::MEMORY_FREED: + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "OvercommitResult::MEMORY_FREED shouldn't be asked for description"); + case OvercommitResult::SELECTED: + return "Query was selected to stop by OvercommitTracker."; + case OvercommitResult::TIMEOUTED: + return "Waiting timeout for memory to be freed is reached."; + case OvercommitResult::NOT_ENOUGH_FREED: + return "Memory overcommit has freed not enough memory."; + } +} + +} + namespace ProfileEvents { extern const Event QueryMemoryLimitExceeded; @@ -189,11 +214,11 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) { - bool need_to_throw = true; + OvercommitResult overcommit_result = OvercommitResult::NONE; if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed); overcommit_tracker_ptr != nullptr && query_tracker != nullptr) - need_to_throw = overcommit_tracker_ptr->needToStopQuery(query_tracker, size); + overcommit_result = overcommit_tracker_ptr->needToStopQuery(query_tracker, size); - if (need_to_throw) + if (overcommit_result != OvercommitResult::MEMORY_FREED) { /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); @@ -201,12 +226,13 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT const auto * description = description_ptr.load(std::memory_order_relaxed); throw DB::Exception( DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED, - "Memory limit{}{} exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}", + "Memory limit{}{} exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}. 
OvercommitTracker decision: {}.", description ? " " : "", description ? description : "", formatReadableSizeWithBinarySuffix(will_be), size, - formatReadableSizeWithBinarySuffix(current_hard_limit)); + formatReadableSizeWithBinarySuffix(current_hard_limit), + toDescription(overcommit_result)); } else { diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index dbacc0d81a4..0c03ba58e87 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -2,8 +2,14 @@ #include #include +#include #include +namespace ProfileEvents +{ + extern const Event MemoryOvercommitWaitTimeMicroseconds; +} + using namespace std::chrono_literals; constexpr std::chrono::microseconds ZERO_MICROSEC = 0us; @@ -24,7 +30,7 @@ void OvercommitTracker::setMaxWaitTime(UInt64 wait_time) max_wait_time = wait_time * 1us; } -bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) +OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) { // NOTE: Do not change the order of locks // @@ -36,7 +42,7 @@ bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) std::unique_lock lk(overcommit_m); if (max_wait_time == ZERO_MICROSEC) - return true; + return OvercommitResult::DISABLED; pickQueryToExclude(); assert(cancellation_state != QueryCancellationState::NONE); @@ -50,7 +56,7 @@ bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) // picked_tracker to be not null pointer. assert(cancellation_state == QueryCancellationState::SELECTED); cancellation_state = QueryCancellationState::NONE; - return true; + return OvercommitResult::DISABLED; } if (picked_tracker == tracker) { @@ -58,17 +64,20 @@ bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) // It may happen even when current state is RUNNING, because // ThreadStatus::~ThreadStatus may call MemoryTracker::alloc. cancellation_state = QueryCancellationState::RUNNING; - return true; + return OvercommitResult::SELECTED; } allow_release = true; required_memory += amount; required_per_thread[tracker] = amount; + auto wait_start_time = std::chrono::system_clock::now(); bool timeout = !cv.wait_for(lk, max_wait_time, [this, tracker]() { return required_per_thread[tracker] == 0 || cancellation_state == QueryCancellationState::NONE; }); + auto wait_end_time = std::chrono::system_clock::now(); + ProfileEvents::increment(ProfileEvents::MemoryOvercommitWaitTimeMicroseconds, (wait_end_time - wait_start_time) / 1us); LOG_DEBUG(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : "")); required_memory -= amount; @@ -84,7 +93,12 @@ bool OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) // As we don't need to free memory, we can continue execution of the selected query. 
if (required_memory == 0 && cancellation_state == QueryCancellationState::SELECTED) reset(); - return timeout || still_need != 0; + if (timeout) + return OvercommitResult::TIMEOUTED; + if (still_need != 0) + return OvercommitResult::NOT_ENOUGH_FREED; + else + return OvercommitResult::MEMORY_FREED; } void OvercommitTracker::tryContinueQueryExecutionAfterFree(Int64 amount) diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 37de75f4848..79ed36cd7fa 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -36,6 +36,16 @@ struct OvercommitRatio class MemoryTracker; +enum class OvercommitResult +{ + NONE, + DISABLED, + MEMORY_FREED, + SELECTED, + TIMEOUTED, + NOT_ENOUGH_FREED, +}; + enum class QueryCancellationState { NONE = 0, // Hard limit is not reached, there is no selected query to kill. @@ -54,7 +64,7 @@ struct OvercommitTracker : boost::noncopyable { void setMaxWaitTime(UInt64 wait_time); - bool needToStopQuery(MemoryTracker * tracker, Int64 amount); + OvercommitResult needToStopQuery(MemoryTracker * tracker, Int64 amount); void tryContinueQueryExecutionAfterFree(Int64 amount); diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 7f3b9788c1f..72fefc3e31c 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -192,6 +192,7 @@ M(RealTimeMicroseconds, "Total (wall clock) time spent in processing (queries and other tasks) threads (not that this is a sum).") \ M(UserTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in user space. This include time CPU pipeline was stalled due to cache misses, branch mispredictions, hyper-threading, etc.") \ M(SystemTimeMicroseconds, "Total time spent in processing (queries and other tasks) threads executing CPU instructions in OS kernel space. 
This include time CPU pipeline was stalled due to cache misses, branch mispredictions, hyper-threading, etc.") \ + M(MemoryOvercommitWaitTimeMicroseconds, "Total time spent in waiting for memory to be freed in OvercommitTracker.") \ M(SoftPageFaults, "") \ M(HardPageFaults, "") \ \ diff --git a/src/Common/tests/gtest_overcommit_tracker.cpp b/src/Common/tests/gtest_overcommit_tracker.cpp index 542af815842..c56ecec669f 100644 --- a/src/Common/tests/gtest_overcommit_tracker.cpp +++ b/src/Common/tests/gtest_overcommit_tracker.cpp @@ -56,7 +56,7 @@ void free_not_continue_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -112,7 +112,7 @@ void free_continue_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -168,7 +168,7 @@ void free_continue_and_alloc_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -181,7 +181,7 @@ void free_continue_and_alloc_test(T & overcommit_tracker) MemoryTracker failed; std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); - stopped_next = overcommit_tracker.needToStopQuery(&failed, 100); + stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; } ).join(); @@ -228,7 +228,7 @@ void free_continue_and_alloc_2_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -241,7 +241,7 @@ void free_continue_and_alloc_2_test(T & overcommit_tracker) MemoryTracker failed; std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); - stopped_next = overcommit_tracker.needToStopQuery(&failed, 100); + stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; } )); @@ -296,7 +296,7 @@ void free_continue_and_alloc_3_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -309,7 +309,7 @@ void free_continue_and_alloc_3_test(T & overcommit_tracker) MemoryTracker failed; std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); - stopped_next = overcommit_tracker.needToStopQuery(&failed, 100); + stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; } )); @@ -364,7 +364,7 @@ void free_continue_2_test(T & overcommit_tracker) threads.push_back(std::thread( [&, i]() { - if (overcommit_tracker.needToStopQuery(&trackers[i], 100)) + if (overcommit_tracker.needToStopQuery(&trackers[i], 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } )); @@ -415,7 +415,7 @@ void query_stop_not_continue_test(T & overcommit_tracker) auto thread = std::thread( [&]() { - if 
(overcommit_tracker.needToStopQuery(&another, 100)) + if (overcommit_tracker.needToStopQuery(&another, 100) != OvercommitResult::MEMORY_FREED) ++need_to_stop; } ); From ea60a614d2dd166906efaff26f67e25c571b8a31 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 25 May 2022 20:33:13 +0200 Subject: [PATCH 04/69] Decrease namespace indent --- src/Functions/Regexps.h | 446 ++++++++++++++++++++-------------------- 1 file changed, 224 insertions(+), 222 deletions(-) diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index dc94b75211c..2611afedc14 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -38,254 +38,256 @@ namespace ErrorCodes namespace Regexps { - using Regexp = OptimizedRegularExpressionSingleThreaded; - using Pool = ObjectPoolMap; +using Regexp = OptimizedRegularExpressionSingleThreaded; +using Pool = ObjectPoolMap; - template - inline Regexp createRegexp(const std::string & pattern, int flags) +template +inline Regexp createRegexp(const std::string & pattern, int flags) +{ + if constexpr (like) + return {likePatternToRegexp(pattern), flags}; + else + return {pattern, flags}; +} + +template +inline int buildRe2Flags() +{ + int flags = OptimizedRegularExpression::RE_DOT_NL; + if constexpr (no_capture) + flags |= OptimizedRegularExpression::RE_NO_CAPTURE; + if constexpr (case_insensitive) + flags |= OptimizedRegularExpression::RE_CASELESS; + return flags; +} + +/** Returns holder of an object from Pool. + * You must hold the ownership while using the object. + * In destructor, it returns the object back to the Pool for further reuse. + */ +template +inline Pool::Pointer get(const std::string & pattern) +{ + /// the Singleton is thread-safe in C++11 + static Pool known_regexps; /// Different variables for different pattern parameters. + + return known_regexps.get(pattern, [&pattern] { - if constexpr (like) - return {likePatternToRegexp(pattern), flags}; - else - return {pattern, flags}; - } + const int flags = buildRe2Flags(); + ProfileEvents::increment(ProfileEvents::RegexpCreated); + return new Regexp{createRegexp(pattern, flags)}; + }); +} - template - inline int buildRe2Flags() - { - int flags = OptimizedRegularExpression::RE_DOT_NL; - if constexpr (no_capture) - flags |= OptimizedRegularExpression::RE_NO_CAPTURE; - if constexpr (case_insensitive) - flags |= OptimizedRegularExpression::RE_CASELESS; - return flags; - } - - /** Returns holder of an object from Pool. - * You must hold the ownership while using the object. - * In destructor, it returns the object back to the Pool for further reuse. - */ - template - inline Pool::Pointer get(const std::string & pattern) - { - /// the Singleton is thread-safe in C++11 - static Pool known_regexps; /// Different variables for different pattern parameters. - - return known_regexps.get(pattern, [&pattern] - { - const int flags = buildRe2Flags(); - ProfileEvents::increment(ProfileEvents::RegexpCreated); - return new Regexp{createRegexp(pattern, flags)}; - }); - } } #if USE_HYPERSCAN namespace MultiRegexps { - template - struct HyperscanDeleter +template +struct HyperscanDeleter +{ + template + void operator()(T * ptr) const { - template - void operator()(T * ptr) const - { - deleter(ptr); - } - }; + deleter(ptr); + } +}; - /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception. 
- using CompilerError = std::unique_ptr>; - using ScratchPtr = std::unique_ptr>; - using DataBasePtr = std::unique_ptr>; +/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception. +using CompilerError = std::unique_ptr>; +using ScratchPtr = std::unique_ptr>; +using DataBasePtr = std::unique_ptr>; - /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher. - class Regexps +/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher. +class Regexps +{ +public: + Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { } + + hs_database_t * getDB() const { return db.get(); } + hs_scratch_t * getScratch() const { return scratch.get(); } + +private: + DataBasePtr db; + ScratchPtr scratch; +}; + +class RegexpsConstructor +{ +public: + RegexpsConstructor() = default; + + void setConstructor(std::function constructor_) { constructor = std::move(constructor_); } + + Regexps * operator()() { - public: - Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { } - - hs_database_t * getDB() const { return db.get(); } - hs_scratch_t * getScratch() const { return scratch.get(); } - - private: - DataBasePtr db; - ScratchPtr scratch; - }; - - class RegexpsConstructor - { - public: - RegexpsConstructor() = default; - - void setConstructor(std::function constructor_) { constructor = std::move(constructor_); } - - Regexps * operator()() - { - std::unique_lock lock(mutex); - if (regexp) - return &*regexp; - regexp = constructor(); + std::unique_lock lock(mutex); + if (regexp) return &*regexp; - } + regexp = constructor(); + return &*regexp; + } - private: - std::function constructor; - std::optional regexp; - std::mutex mutex; - }; +private: + std::function constructor; + std::optional regexp; + std::mutex mutex; +}; - struct Pool +struct Pool +{ + /// Mutex for finding in map. + std::mutex mutex; + /// Patterns + possible edit_distance to database and scratch. + std::map, std::optional>, RegexpsConstructor> storage; +}; + +template +inline Regexps constructRegexps(const std::vector & str_patterns, std::optional edit_distance) +{ + (void)edit_distance; + /// Common pointers + std::vector patterns; + std::vector flags; + + /// Pointer for external edit distance compilation + std::vector ext_exprs; + std::vector ext_exprs_ptrs; + + patterns.reserve(str_patterns.size()); + flags.reserve(str_patterns.size()); + + if constexpr (CompileForEditDistance) { - /// Mutex for finding in map. - std::mutex mutex; - /// Patterns + possible edit_distance to database and scratch. - std::map, std::optional>, RegexpsConstructor> storage; - }; + ext_exprs.reserve(str_patterns.size()); + ext_exprs_ptrs.reserve(str_patterns.size()); + } - template - inline Regexps constructRegexps(const std::vector & str_patterns, std::optional edit_distance) + for (const StringRef ref : str_patterns) { - (void)edit_distance; - /// Common pointers - std::vector patterns; - std::vector flags; - - /// Pointer for external edit distance compilation - std::vector ext_exprs; - std::vector ext_exprs_ptrs; - - patterns.reserve(str_patterns.size()); - flags.reserve(str_patterns.size()); - + patterns.push_back(ref.data); + /* Flags below are the pattern matching flags. + * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. 
This is a good + * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode + * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match. + * HS_FLAG_UTF8 is a flag where UTF8 literals are matched. + * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice + * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag + */ + flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); if constexpr (CompileForEditDistance) { - ext_exprs.reserve(str_patterns.size()); - ext_exprs_ptrs.reserve(str_patterns.size()); + /// Hyperscan currently does not support UTF8 matching with edit distance. + flags.back() &= ~HS_FLAG_UTF8; + ext_exprs.emplace_back(); + /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance. + ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; + ext_exprs.back().edit_distance = edit_distance.value(); + ext_exprs_ptrs.push_back(&ext_exprs.back()); } - - for (const StringRef ref : str_patterns) - { - patterns.push_back(ref.data); - /* Flags below are the pattern matching flags. - * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good - * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode - * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match. - * HS_FLAG_UTF8 is a flag where UTF8 literals are matched. - * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice - * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag - */ - flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); - if constexpr (CompileForEditDistance) - { - /// Hyperscan currently does not support UTF8 matching with edit distance. - flags.back() &= ~HS_FLAG_UTF8; - ext_exprs.emplace_back(); - /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance. - ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; - ext_exprs.back().edit_distance = edit_distance.value(); - ext_exprs_ptrs.push_back(&ext_exprs.back()); - } - } - hs_database_t * db = nullptr; - hs_compile_error_t * compile_error; - - std::unique_ptr ids; - - /// We mark the patterns to provide the callback results. - if constexpr (save_indices) - { - ids.reset(new unsigned int[patterns.size()]); - for (size_t i = 0; i < patterns.size(); ++i) - ids[i] = i + 1; - } - - hs_error_t err; - if constexpr (!CompileForEditDistance) - err = hs_compile_multi( - patterns.data(), - flags.data(), - ids.get(), - patterns.size(), - HS_MODE_BLOCK, - nullptr, - &db, - &compile_error); - else - err = hs_compile_ext_multi( - patterns.data(), - flags.data(), - ids.get(), - ext_exprs_ptrs.data(), - patterns.size(), - HS_MODE_BLOCK, - nullptr, - &db, - &compile_error); - - if (err != HS_SUCCESS) - { - /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. 
- CompilerError error(compile_error); - - if (error->expression < 0) - throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR); - else - throw Exception( - "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message), - ErrorCodes::BAD_ARGUMENTS); - } - - ProfileEvents::increment(ProfileEvents::RegexpCreated); - - /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch - /// function which is faster than allocating scratch space each time in each thread. - hs_scratch_t * scratch = nullptr; - err = hs_alloc_scratch(db, &scratch); - - /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch. - if (err != HS_SUCCESS) - throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY); - - return Regexps{db, scratch}; } + hs_database_t * db = nullptr; + hs_compile_error_t * compile_error; - /// If CompileForEditDistance is False, edit_distance must be nullopt - /// Also, we use templates here because each instantiation of function - /// template has its own copy of local static variables which must not be the same - /// for different hyperscan compilations. - template - inline Regexps * get(const std::vector & patterns, std::optional edit_distance) + std::unique_ptr ids; + + /// We mark the patterns to provide the callback results. + if constexpr (save_indices) { - /// C++11 has thread-safe function-local static on most modern compilers. - static Pool known_regexps; /// Different variables for different pattern parameters. - - std::vector str_patterns; - str_patterns.reserve(patterns.size()); - for (const StringRef & ref : patterns) - str_patterns.push_back(ref.toString()); - - /// Get the lock for finding database. - std::unique_lock lock(known_regexps.mutex); - - auto it = known_regexps.storage.find({str_patterns, edit_distance}); - - /// If not found, compile and let other threads wait. - if (known_regexps.storage.end() == it) - { - it = known_regexps.storage - .emplace(std::piecewise_construct, std::make_tuple(std::move(str_patterns), edit_distance), std::make_tuple()) - .first; - it->second.setConstructor([&str_patterns = it->first.first, edit_distance]() - { - return constructRegexps(str_patterns, edit_distance); - }); - } - - /// Unlock before possible construction. - lock.unlock(); - return it->second(); + ids.reset(new unsigned int[patterns.size()]); + for (size_t i = 0; i < patterns.size(); ++i) + ids[i] = i + 1; } + + hs_error_t err; + if constexpr (!CompileForEditDistance) + err = hs_compile_multi( + patterns.data(), + flags.data(), + ids.get(), + patterns.size(), + HS_MODE_BLOCK, + nullptr, + &db, + &compile_error); + else + err = hs_compile_ext_multi( + patterns.data(), + flags.data(), + ids.get(), + ext_exprs_ptrs.data(), + patterns.size(), + HS_MODE_BLOCK, + nullptr, + &db, + &compile_error); + + if (err != HS_SUCCESS) + { + /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. 
+ CompilerError error(compile_error); + + if (error->expression < 0) + throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR); + else + throw Exception( + "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message), + ErrorCodes::BAD_ARGUMENTS); + } + + ProfileEvents::increment(ProfileEvents::RegexpCreated); + + /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch + /// function which is faster than allocating scratch space each time in each thread. + hs_scratch_t * scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + + /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch. + if (err != HS_SUCCESS) + throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + return Regexps{db, scratch}; +} + +/// If CompileForEditDistance is False, edit_distance must be nullopt +/// Also, we use templates here because each instantiation of function +/// template has its own copy of local static variables which must not be the same +/// for different hyperscan compilations. +template +inline Regexps * get(const std::vector & patterns, std::optional edit_distance) +{ + /// C++11 has thread-safe function-local static on most modern compilers. + static Pool known_regexps; /// Different variables for different pattern parameters. + + std::vector str_patterns; + str_patterns.reserve(patterns.size()); + for (const StringRef & ref : patterns) + str_patterns.push_back(ref.toString()); + + /// Get the lock for finding database. + std::unique_lock lock(known_regexps.mutex); + + auto it = known_regexps.storage.find({str_patterns, edit_distance}); + + /// If not found, compile and let other threads wait. + if (known_regexps.storage.end() == it) + { + it = known_regexps.storage + .emplace(std::piecewise_construct, std::make_tuple(std::move(str_patterns), edit_distance), std::make_tuple()) + .first; + it->second.setConstructor([&str_patterns = it->first.first, edit_distance]() + { + return constructRegexps(str_patterns, edit_distance); + }); + } + + /// Unlock before possible construction. + lock.unlock(); + return it->second(); +} + } #endif // USE_HYPERSCAN From 49934a3dc865cc8131d94de4592d3bd4f21150c0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 25 May 2022 21:22:45 +0200 Subject: [PATCH 05/69] Cache compiled regexps when evaluating non-const needles Needles in a (non-const) needle column may repeat and this commit allows to skip compilation for known needles. Out of the different design alternatives (see below, if someone is interested), we now maintain - one global pattern cache, - with a fixed size of 42k elements currently, - and use LRU as eviction strategy. ------------------------------------------------------------------------ (sorry for the wall of text, dumping it here not for reading but just for reference) Write-up about considered design alternatives: 1. Keep the current global cache of const needles. For non-const needles, probe the cache but don't store values in it. Pros: need to maintain just a single cache, no problem with cache pollution assuming there are few distinct constant needles Cons: only useful if a non-const needle occurred as already as a const needle --> overall too simplistic 2. Keep the current global cache for const needles. For non-const needles, create a local (e.g. 
per-query) cache Pros: unlike (1.), non-const needles can be skipped even if they did not occur yet, no pollution of the const pattern cache when there are very many non-const needles (e.g. large / highly distinct needle columns). Cons: caches may explode "horizontally", i.e. we'll end up with the const cache + caches for Q1, Q2, ... QN, this makes it harder to control the overall space consumption, also patterns residing in different caches cannot be reused between queries, another difficulty is that the concept of "query" does not really exist at matching level - there are only column chunks and we'd potentially end up with 1 cache / chunk 3. Queries with const and non-const needles insert into the same global cache. Pros: the advantages of (2.) + allows to reuse compiled patterns accross parallel queries Cons: needs an eviction strategy to control cache size and pollution (and btw. (2.) also needs eviction strategies for the individual caches) 4. Queries with const needle use global cache, queries with non-const needle use a different global cache --> Overall similar to (3) but ignores the (likely) edge case that const and non-const needles overlap. In sum, (3.) seems the simplest and most beneficial approach. Eviction strategies: 0. Don't ever evict --> cache may grow infinitely and eventually make the system unusable (may even pose a DoS risk) 1. Flush the cache after a certain threshold is exceeded --> very simple but may lead to peridic performance drops 2. Use LRU --> more graceful performance degradation at threshold but comes with a (constant) performance overhead to maintain the LRU queue In sum, given that the pattern compilation in RE2 should be quite costly (pattern-to-DFA/NFA), LRU may be acceptable. --- src/Functions/FunctionsStringArray.h | 4 ++-- src/Functions/MatchImpl.h | 30 ++++++++++--------------- src/Functions/Regexps.h | 33 +++++++++++----------------- src/Functions/countMatches.h | 4 ++-- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 2680816670f..6545c3e3549 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -448,7 +448,7 @@ public: class SplitByRegexpImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; Pos pos; @@ -532,7 +532,7 @@ public: class ExtractAllImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; size_t capture; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 17bda74f8ab..9779eb8d608 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -166,7 +166,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -325,7 +325,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -479,22 +479,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, 
required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -524,7 +521,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, @@ -593,22 +590,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -638,7 +632,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 2611afedc14..be3ce6cdeee 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -39,16 +39,8 @@ namespace ErrorCodes namespace Regexps { using Regexp = OptimizedRegularExpressionSingleThreaded; -using Pool = ObjectPoolMap; - -template -inline Regexp createRegexp(const std::string & pattern, int flags) -{ - if constexpr (like) - return {likePatternToRegexp(pattern), flags}; - else - return {pattern, flags}; -} +using Cache = LRUCache; +using RegexpPtr = Cache::MappedPtr; template inline int buildRe2Flags() @@ -61,22 +53,23 @@ inline int buildRe2Flags() return flags; } -/** Returns holder of an object from Pool. - * You must hold the ownership while using the object. - * In destructor, it returns the object back to the Pool for further reuse. - */ +/// Probes the cache of known compiled regexps for the given string pattern and returns a compiled regexp if +/// found. Otherwise, a new cache entry is created. template -inline Pool::Pointer get(const std::string & pattern) +inline RegexpPtr get(const String & pattern) { - /// the Singleton is thread-safe in C++11 - static Pool known_regexps; /// Different variables for different pattern parameters. 
+ static Cache known_regexps(42'000); - return known_regexps.get(pattern, [&pattern] + auto [regexp_ptr, _] = known_regexps.getOrSet(pattern, [&pattern]() { const int flags = buildRe2Flags(); ProfileEvents::increment(ProfileEvents::RegexpCreated); - return new Regexp{createRegexp(pattern, flags)}; + if constexpr (like) + return std::make_shared(likePatternToRegexp(pattern), flags); + else + return std::make_shared(pattern, flags); }); + return regexp_ptr; } } diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 6d60ca94c18..1d43b66d867 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -55,7 +55,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - Regexps::Pool::Pointer re = Regexps::get(column_pattern->getValue()); + Regexps::RegexpPtr re = Regexps::get(column_pattern->getValue()); OptimizedRegularExpression::MatchVec matches; const IColumn * column_haystack = arguments[0].column.get(); @@ -95,7 +95,7 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); } - static uint64_t countMatches(StringRef src, Regexps::Pool::Pointer & re, OptimizedRegularExpression::MatchVec & matches) + static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches) { /// Only one match is required, no need to copy more. static const unsigned matches_limit = 1; From 25884c68f15967ec5dcd7021db5d795647e0e3a4 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Thu, 26 May 2022 20:46:26 -0400 Subject: [PATCH 06/69] http named collection source implemented for dictionary --- src/Dictionaries/HTTPDictionarySource.cpp | 71 +++++++++++++------ .../ExternalDataSourceConfiguration.cpp | 57 +++++++++++++++ .../ExternalDataSourceConfiguration.h | 6 ++ 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp index cf8b60f3681..8f7ca5e7a51 100644 --- a/src/Dictionaries/HTTPDictionarySource.cpp +++ b/src/Dictionaries/HTTPDictionarySource.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "DictionarySourceFactory.h" @@ -228,45 +229,69 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory) if (dict_struct.has_expressions) throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `http` does not support attribute expressions"); - auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix); - - const auto & settings_config_prefix = config_prefix + ".http"; - const auto & credentials_prefix = settings_config_prefix + ".credentials"; - + auto settings_config_prefix = config_prefix + ".http"; Poco::Net::HTTPBasicCredentials credentials; - - if (config.has(credentials_prefix)) - { - credentials.setUsername(config.getString(credentials_prefix + ".user", "")); - credentials.setPassword(config.getString(credentials_prefix + ".password", "")); - } - - const auto & headers_prefix = settings_config_prefix + ".headers"; ReadWriteBufferFromHTTP::HTTPHeaderEntries header_entries; + String url; + String format; - if (config.has(headers_prefix)) + auto named_collection = created_from_ddl + ? 
getURLBasedDataSourceConfiguration(config, settings_config_prefix, global_context) + : std::nullopt; + if (named_collection) { - Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(headers_prefix, config_keys); + url = named_collection->configuration.url; + format = named_collection->configuration.format; - header_entries.reserve(config_keys.size()); - for (const auto & key : config_keys) + credentials.setUsername(named_collection->configuration.user); + credentials.setPassword(named_collection->configuration.password); + + header_entries.reserve(named_collection->configuration.headers.size()); + for (const auto & header : named_collection->configuration.headers) + header_entries.emplace_back(std::make_tuple(header.first, header.second.get())); + } + else + { + const auto & credentials_prefix = settings_config_prefix + ".credentials"; + + if (config.has(credentials_prefix)) { - const auto header_key = config.getString(headers_prefix + "." + key + ".name", ""); - const auto header_value = config.getString(headers_prefix + "." + key + ".value", ""); - header_entries.emplace_back(std::make_tuple(header_key, header_value)); + credentials.setUsername(config.getString(credentials_prefix + ".user", "")); + credentials.setPassword(config.getString(credentials_prefix + ".password", "")); } + + const auto & headers_prefix = settings_config_prefix + ".headers"; + + + if (config.has(headers_prefix)) + { + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(headers_prefix, config_keys); + + header_entries.reserve(config_keys.size()); + for (const auto & key : config_keys) + { + const auto header_key = config.getString(headers_prefix + "." + key + ".name", ""); + const auto header_value = config.getString(headers_prefix + "." + key + ".value", ""); + header_entries.emplace_back(std::make_tuple(header_key, header_value)); + } + } + + url = config.getString(settings_config_prefix + ".url", ""); + format =config.getString(settings_config_prefix + ".format", ""); } auto configuration = HTTPDictionarySource::Configuration { - .url = config.getString(settings_config_prefix + ".url", ""), - .format =config.getString(settings_config_prefix + ".format", ""), + .url = url, + .format = format, .update_field = config.getString(settings_config_prefix + ".update_field", ""), .update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1), .header_entries = std::move(header_entries) //-V1030 }; + auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix); + return std::make_unique(dict_struct, configuration, credentials, sample_block, context, created_from_ddl); }; factory.registerSource("http", create_table_source); diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index abd20e6e5fd..55eff117d5e 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -248,6 +248,63 @@ std::optional getExternalDataSourceConfiguration( return std::nullopt; } +std::optional getURLBasedDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context) +{ + URLBasedDataSourceConfiguration configuration; + auto collection_name = dict_config.getString(dict_config_prefix + ".name", ""); + if (!collection_name.empty()) + { + const auto & config = context->getConfigRef(); + const auto & collection_prefix = fmt::format("named_collections.{}", collection_name); + + if 
(!config.has(collection_prefix)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection_name); + + configuration.url = + dict_config.getString(dict_config_prefix + ".url", config.getString(collection_prefix + ".url", "")); + configuration.format = + dict_config.getString(dict_config_prefix + ".format", config.getString(collection_prefix + ".format", "")); + configuration.compression_method = + dict_config.getString(dict_config_prefix + ".compression", config.getString(collection_prefix + ".compression_method", "")); + configuration.structure = + dict_config.getString(dict_config_prefix + ".structure", config.getString(collection_prefix + ".structure", "")); + configuration.user = + dict_config.getString(dict_config_prefix + ".credentials.user", config.getString(collection_prefix + ".credentials.user", "")); + configuration.password = + dict_config.getString(dict_config_prefix + ".credentials.password", config.getString(collection_prefix + ".credentials.password", "")); + + String headers_prefix; + const Poco::Util::AbstractConfiguration *headers_config = nullptr; + if (dict_config.has(dict_config_prefix + ".headers")) + { + headers_prefix = dict_config_prefix + ".headers"; + headers_config = &dict_config; + } + else + { + headers_prefix = collection_prefix + ".headers"; + headers_config = &config; + } + + if (headers_config) + { + Poco::Util::AbstractConfiguration::Keys header_keys; + headers_config->keys(headers_prefix, header_keys); + headers_prefix += "."; + for (const auto & header : header_keys) + { + const auto header_prefix = headers_prefix + header; + configuration.headers.emplace_back( + std::make_pair(headers_config->getString(header_prefix + ".name"), headers_config->getString(header_prefix + ".value"))); + } + } + + return URLBasedDataSourceConfig{ .configuration = configuration }; + } + + return std::nullopt; +} ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context, HasConfigKeyFunc has_config_key) diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index dfac101e22d..19301c360f0 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -103,6 +103,9 @@ struct URLBasedDataSourceConfiguration String compression_method = "auto"; String structure = "auto"; + String user; + String password; + std::vector> headers; String http_method; @@ -129,6 +132,9 @@ struct URLBasedDataSourceConfig std::optional getURLBasedDataSourceConfiguration(const ASTs & args, ContextPtr context); +std::optional getURLBasedDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context); + template bool getExternalDataSourceConfiguration(const ASTs & args, BaseSettings & settings, ContextPtr context); From 54d6f981222819ea9c915c151bf711ae8c07a1ba Mon Sep 17 00:00:00 2001 From: Vxider Date: Fri, 27 May 2022 04:50:36 +0000 Subject: [PATCH 07/69] flush and shutdown temporary table before drop --- src/Interpreters/DatabaseCatalog.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 267564eb84c..8e41887ca44 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -119,7 +119,11 @@ TemporaryTableHolder & 
TemporaryTableHolder::operator=(TemporaryTableHolder && r TemporaryTableHolder::~TemporaryTableHolder() { if (id != UUIDHelpers::Nil) + { + auto table = getTable(); + table->flushAndShutdown(); temporary_tables->dropTable(getContext(), "_tmp_" + toString(id)); + } } StorageID TemporaryTableHolder::getGlobalTableID() const From ca67e67a7432582ee5ff5837d31432ab0e585d56 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Fri, 27 May 2022 15:52:04 +0800 Subject: [PATCH 08/69] Fix a typo --- src/Interpreters/ActionsDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 2fc9b51674f..eb073ee8752 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -39,7 +39,7 @@ void ActionsDAG::Node::toTree(JSONBuilder::JSONMap & map) const map.add("Result Type", result_type->getName()); if (!result_name.empty()) - map.add("Result Type", magic_enum::enum_name(type)); + map.add("Result Name", result_name); if (column) map.add("Column", column->getName()); From ff228d63e8eece4cdd504f000de1dd2260954941 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 27 May 2022 10:14:13 +0200 Subject: [PATCH 09/69] Fix typo --- docs/en/development/tests.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index f9c7ae37157..be361bd1e3f 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -81,7 +81,7 @@ $ ./src/unit_tests_dbms --gtest_filter=LocalAddress* ## Performance Tests {#performance-tests} -Performance tests allow to measure and compare performance of some isolated part of ClickHouse on synthetic queries. Tests are located at `tests/performance`. Each test is represented by `.xml` file with description of test case. Tests are run with `docker/tests/performance-comparison` tool . See the readme file for invocation. +Performance tests allow to measure and compare performance of some isolated part of ClickHouse on synthetic queries. Tests are located at `tests/performance`. Each test is represented by `.xml` file with description of test case. Tests are run with `docker/test/performance-comparison` tool . See the readme file for invocation. Each test run one or multiple queries (possibly with combinations of parameters) in a loop. From 7ccf4f4db73fc5f28efeb36427a2eae0af05dcc3 Mon Sep 17 00:00:00 2001 From: PigInCloud <44889745+yjant@users.noreply.github.com> Date: Fri, 27 May 2022 18:30:20 +0800 Subject: [PATCH 10/69] Update insert-into.md I translated some untranslated text into Chinese --- .../sql-reference/statements/insert-into.md | 68 ++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/docs/zh/sql-reference/statements/insert-into.md b/docs/zh/sql-reference/statements/insert-into.md index 928107fa2b2..4f958e31b18 100644 --- a/docs/zh/sql-reference/statements/insert-into.md +++ b/docs/zh/sql-reference/statements/insert-into.md @@ -71,7 +71,7 @@ INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... 
``` -ClickHouse会清除数据前所有的空白字符与一行摘要信息(如果需要的话)。所以在进行查询时,我们建议您将数据放入到输入输出格式名称后的新的一行中去(如果数据是以空白字符开始的,这将非常重要)。 +ClickHouse会清除数据前所有的空白字符与一个换行符(如果有换行符的话)。所以在进行查询时,我们建议您将数据放入到输入输出格式名称后的新的一行中去(如果数据是以空白字符开始的,这将非常重要)。 示例: @@ -83,6 +83,10 @@ INSERT INTO t FORMAT TabSeparated 在使用命令行客户端或HTTP客户端时,你可以将具体的查询语句与数据分开发送。更多具体信息,请参考«[客户端](../../interfaces/index.md#interfaces)»部分。 +### 限制 {#constraints} + +如果表中有一些[限制](../../sql-reference/statements/create/table.md#constraints),,数据插入时会逐行进行数据校验,如果这里面包含了不符合限制条件的数据,服务将会抛出包含限制信息的异常,这个语句也会被停止执行。 + ### 使用`SELECT`的结果写入 {#insert_query_insert-select} ``` sql @@ -96,6 +100,66 @@ INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... 系统不支持的其他用于修改数据的查询:`UPDATE`, `DELETE`, `REPLACE`, `MERGE`, `UPSERT`, `INSERT UPDATE`。 但是,您可以使用 `ALTER TABLE ... DROP PARTITION`查询来删除一些旧的数据。 +如果 `SELECT` 查询中包含了 [input()](../../sql-reference/table-functions/input.md) 函数,那么 `FORMAT` 必须出现在查询语句的最后。 + +如果某一列限制了值不能是NULL,那么插入NULL的时候就会插入这个列类型的默认数据,可以通过设置 [insert_null_as_default](../../operations/settings/settings.md#insert_null_as_default) 插入NULL。 + +### 从文件向表中插入数据 {#inserting-data-from-a-file} + +``` sql +INSERT INTO [db.]table [(c1, c2, c3)] FROM INFILE file_name [COMPRESSION type] FORMAT format_name +``` +使用上面的语句可以从客户端的文件上读取数据并插入表中,`file_name` 和 `type` 都是 `String` 类型,输入文件的[格式](../../interfaces/formats.md) 一定要在 `FORMAT` 语句中设置。 + +支持读取压缩文件。默认会去读文件的拓展名作为文件的压缩方式,或者也可以在 `COMPRESSION` 语句中指明,支持的文件压缩格式如下:`'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`。 + +这个功能在 [command-line client](../../interfaces/cli.md) 和 [clickhouse-local](../../operations/utilities/clickhouse-local.md) 是可用的。 + +**样例** + +```bash +echo 1,A > input.csv ; echo 2,B >> input.csv +clickhouse-client --query="CREATE TABLE table_from_file (id UInt32, text String) ENGINE=MergeTree() ORDER BY id;" +clickhouse-client --query="INSERT INTO table_from_file FROM INFILE 'input.csv' FORMAT CSV;" +clickhouse-client --query="SELECT * FROM table_from_file FORMAT PrettyCompact;" +``` + +结果: + +```text +┌─id─┬─text─┐ +│ 1 │ A │ +│ 2 │ B │ +└────┴──────┘ +``` + +### 插入表函数 {#inserting-into-table-function} + +数据可以通过 [table functions](../../sql-reference/table-functions/index.md) 方法插入。 +``` sql +INSERT INTO [TABLE] FUNCTION table_func ... +``` + +**例如** + +可以这样使用[remote](../../sql-reference/table-functions/index.md#remote) 表函数: + +``` sql +CREATE TABLE simple_table (id UInt32, text String) ENGINE=MergeTree() ORDER BY id; +INSERT INTO TABLE FUNCTION remote('localhost', default.simple_table) + VALUES (100, 'inserted via remote()'); +SELECT * FROM simple_table; +``` + +结果: + +``` text +┌──id─┬─text──────────────────┐ +│ 100 │ inserted via remote() │ +└─────┴───────────────────────┘ +``` + + ### 性能的注意事项 {#xing-neng-de-zhu-yi-shi-xiang} 在进行`INSERT`时将会对写入的数据进行一些处理,按照主键排序,按照月份对数据进行分区等。所以如果在您的写入数据中包含多个月份的混合数据时,将会显著的降低`INSERT`的性能。为了避免这种情况: @@ -108,4 +172,6 @@ INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... 
- 数据总是被实时的写入。 - 写入的数据已经按照时间排序。 +也可以异步的、小规模的插入数据,这些数据会被合并成多个批次,然后安全地写入到表中,通过设置[async_insert](../../operations/settings/settings.md#async-insert),可以使用异步插入的方式,请注意,异步插入的方式只支持HTTP协议,并且不支持数据去重。 + [来源文章](https://clickhouse.com/docs/en/query_language/insert_into/) From 60b9d81773d5a56486c5685ec3880d6caf207d1f Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 27 May 2022 16:30:29 +0000 Subject: [PATCH 11/69] Remove global_memory_usage_overcommit_max_wait_microseconds --- .../settings.md | 10 ----- docs/en/operations/settings/settings.md | 2 +- programs/server/Server.cpp | 2 - src/Common/MemoryTracker.cpp | 8 ++++ src/Common/MemoryTracker.h | 10 +++++ src/Common/OvercommitTracker.cpp | 11 ++---- src/Common/OvercommitTracker.h | 4 -- src/Common/tests/gtest_overcommit_tracker.cpp | 38 ++++++++++++------- src/Core/Settings.h | 2 +- src/Interpreters/ProcessList.cpp | 3 +- .../configs/global_overcommit_tracker.xml | 1 - .../test_global_overcommit_tracker/test.py | 12 ++++-- 12 files changed, 57 insertions(+), 46 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index fd5c2a187b5..f235fba84f7 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1745,13 +1745,3 @@ Possible values: - Positive integer. Default value: `10000`. - -## global_memory_usage_overcommit_max_wait_microseconds {#global_memory_usage_overcommit_max_wait_microseconds} - -Sets maximum waiting time for global overcommit tracker. - -Possible values: - -- Positive integer. - -Default value: `200`. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 76fbc5f239d..9367d70507f 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4279,7 +4279,7 @@ Maximum time thread will wait for memory to be freed in the case of memory overc If the timeout is reached and memory is not freed, an exception is thrown. Read more about [memory overcommit](memory-overcommit.md). -Default value: `200`. +Default value: `5000000`. 
## memory_overcommit_ratio_denominator_for_user diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 18ab96983eb..2c6ffccd39d 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1095,8 +1095,6 @@ int Server::main(const std::vector & /*args*/) total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); auto * global_overcommit_tracker = global_context->getGlobalOvercommitTracker(); - UInt64 max_overcommit_wait_time = config->getUInt64("global_memory_usage_overcommit_max_wait_microseconds", 5'000'000); - global_overcommit_tracker->setMaxWaitTime(max_overcommit_wait_time); total_memory_tracker.setOvercommitTracker(global_overcommit_tracker); // FIXME logging-related things need synchronization -- see the 'Logger * log' saved diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index b5a27543b4e..51f4c83dc23 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -82,6 +82,8 @@ namespace ProfileEvents extern const Event QueryMemoryLimitExceeded; } +using namespace std::chrono_literals; + static constexpr size_t log_peak_memory_usage_every = 1ULL << 30; MemoryTracker total_memory_tracker(nullptr, VariableContext::Global); @@ -363,6 +365,12 @@ OvercommitRatio MemoryTracker::getOvercommitRatio(Int64 limit) } +void MemoryTracker::setOvercommitWaitingTime(UInt64 wait_time) +{ + max_wait_time.store(wait_time * 1us, std::memory_order_relaxed); +} + + void MemoryTracker::resetCounters() { amount.store(0, std::memory_order_relaxed); diff --git a/src/Common/MemoryTracker.h b/src/Common/MemoryTracker.h index 73af2ab8857..58bd3a460bd 100644 --- a/src/Common/MemoryTracker.h +++ b/src/Common/MemoryTracker.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -73,6 +74,8 @@ private: /// This description will be used as prefix into log messages (if isn't nullptr) std::atomic description_ptr = nullptr; + std::atomic max_wait_time; + std::atomic overcommit_tracker = nullptr; bool updatePeak(Int64 will_be, bool log_memory_usage); @@ -186,6 +189,13 @@ public: OvercommitRatio getOvercommitRatio(); OvercommitRatio getOvercommitRatio(Int64 limit); + std::chrono::microseconds getOvercommitWaitingTime() + { + return max_wait_time.load(std::memory_order_relaxed); + } + + void setOvercommitWaitingTime(UInt64 wait_time); + void setOvercommitTracker(OvercommitTracker * tracker) noexcept { overcommit_tracker.store(tracker, std::memory_order_relaxed); diff --git a/src/Common/OvercommitTracker.cpp b/src/Common/OvercommitTracker.cpp index 0c03ba58e87..3cef72eb8b4 100644 --- a/src/Common/OvercommitTracker.cpp +++ b/src/Common/OvercommitTracker.cpp @@ -15,8 +15,7 @@ using namespace std::chrono_literals; constexpr std::chrono::microseconds ZERO_MICROSEC = 0us; OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) - : max_wait_time(ZERO_MICROSEC) - , picked_tracker(nullptr) + : picked_tracker(nullptr) , cancellation_state(QueryCancellationState::NONE) , global_mutex(global_mutex_) , freed_memory(0) @@ -24,12 +23,6 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_) , allow_release(true) {} -void OvercommitTracker::setMaxWaitTime(UInt64 wait_time) -{ - std::lock_guard guard(overcommit_m); - max_wait_time = wait_time * 1us; -} - OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount) { // NOTE: Do not change the order of locks @@ -41,6 +34,8 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int std::unique_lock 
global_lock(global_mutex); std::unique_lock lk(overcommit_m); + auto max_wait_time = tracker->getOvercommitWaitingTime(); + if (max_wait_time == ZERO_MICROSEC) return OvercommitResult::DISABLED; diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 79ed36cd7fa..80aaed68e37 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -62,8 +62,6 @@ enum class QueryCancellationState // is killed to free memory. struct OvercommitTracker : boost::noncopyable { - void setMaxWaitTime(UInt64 wait_time); - OvercommitResult needToStopQuery(MemoryTracker * tracker, Int64 amount); void tryContinueQueryExecutionAfterFree(Int64 amount); @@ -82,8 +80,6 @@ protected: std::mutex overcommit_m; std::condition_variable cv; - std::chrono::microseconds max_wait_time; - // Specifies memory tracker of the chosen to stop query. // If soft limit is not set, all the queries which reach hard limit must stop. // This case is represented as picked tracker pointer is set to nullptr and diff --git a/src/Common/tests/gtest_overcommit_tracker.cpp b/src/Common/tests/gtest_overcommit_tracker.cpp index c56ecec669f..d832a73ffd9 100644 --- a/src/Common/tests/gtest_overcommit_tracker.cpp +++ b/src/Common/tests/gtest_overcommit_tracker.cpp @@ -40,15 +40,17 @@ static constexpr UInt64 WAIT_TIME = 4'000'000; template void free_not_continue_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for (auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); + std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -96,15 +98,16 @@ TEST(OvercommitTracker, GlobalFreeNotContinue) template void free_continue_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for (auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -152,15 +155,16 @@ TEST(OvercommitTracker, GlobalFreeContinue) template void free_continue_and_alloc_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for (auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -179,6 +183,7 @@ void free_continue_and_alloc_test(T & overcommit_tracker) [&]() { MemoryTracker failed; + failed.setOvercommitWaitingTime(WAIT_TIME); std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; @@ -212,15 +217,16 @@ TEST(OvercommitTracker, GlobalFreeContinueAndAlloc) template void free_continue_and_alloc_2_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for 
(auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -239,6 +245,7 @@ void free_continue_and_alloc_2_test(T & overcommit_tracker) [&]() { MemoryTracker failed; + failed.setOvercommitWaitingTime(WAIT_TIME); std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; @@ -280,15 +287,16 @@ TEST(OvercommitTracker, GlobalFreeContinueAndAlloc2) template void free_continue_and_alloc_3_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for (auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -307,6 +315,7 @@ void free_continue_and_alloc_3_test(T & overcommit_tracker) [&]() { MemoryTracker failed; + failed.setOvercommitWaitingTime(WAIT_TIME); std::this_thread::sleep_for(1000ms); overcommit_tracker.tryContinueQueryExecutionAfterFree(5000); stopped_next = overcommit_tracker.needToStopQuery(&failed, 100) != OvercommitResult::MEMORY_FREED; @@ -348,15 +357,16 @@ TEST(OvercommitTracker, GlobalFreeContinueAndAlloc3) template void free_continue_2_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - static constexpr size_t THREADS = 5; std::vector trackers(THREADS); + for (auto & tracker : trackers) + tracker.setOvercommitWaitingTime(WAIT_TIME); std::atomic need_to_stop = 0; std::vector threads; threads.reserve(THREADS); MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); for (size_t i = 0; i < THREADS; ++i) @@ -404,14 +414,14 @@ TEST(OvercommitTracker, GlobalFreeContinue2) template void query_stop_not_continue_test(T & overcommit_tracker) { - overcommit_tracker.setMaxWaitTime(WAIT_TIME); - std::atomic need_to_stop = 0; MemoryTracker picked; + picked.setOvercommitWaitingTime(WAIT_TIME); overcommit_tracker.setCandidate(&picked); MemoryTracker another; + another.setOvercommitWaitingTime(WAIT_TIME); auto thread = std::thread( [&]() { diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 29427c673ac..9111e1d80da 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -371,7 +371,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \ M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation. Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. 
You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \ \ - M(UInt64, memory_usage_overcommit_max_wait_microseconds, 200, "Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. If timeout is reached and memory is not freed, exception is thrown.", 0) \ + M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, "Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. If timeout is reached and memory is not freed, exception is thrown.", 0) \ \ M(UInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.", 0) \ M(UInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.", 0) \ diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 6c101143234..6b8894414ee 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -225,6 +225,8 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as if (settings.memory_tracker_fault_probability) thread_group->memory_tracker.setFaultProbability(settings.memory_tracker_fault_probability); + thread_group->memory_tracker.setOvercommitWaitingTime(settings.memory_usage_overcommit_max_wait_microseconds); + /// NOTE: Do not set the limit for thread-level memory tracker since it could show unreal values /// since allocation and deallocation could happen in different threads } @@ -244,7 +246,6 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as user_process_list.user_memory_tracker.setOrRaiseHardLimit(settings.max_memory_usage_for_user); user_process_list.user_memory_tracker.setSoftLimit(settings.memory_overcommit_ratio_denominator_for_user); user_process_list.user_memory_tracker.setDescription("(for user)"); - user_process_list.user_overcommit_tracker.setMaxWaitTime(settings.memory_usage_overcommit_max_wait_microseconds); if (!user_process_list.user_throttler) { diff --git a/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml b/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml index 590759bd15d..a05d8865a6b 100644 --- a/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml +++ b/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml @@ -1,4 +1,3 @@ 50000000 - 500 \ No newline at end of file diff --git a/tests/integration/test_global_overcommit_tracker/test.py b/tests/integration/test_global_overcommit_tracker/test.py index d3d56e82f38..093549249ce 100644 --- a/tests/integration/test_global_overcommit_tracker/test.py +++ b/tests/integration/test_global_overcommit_tracker/test.py @@ -18,8 +18,8 @@ def start_cluster(): cluster.shutdown() -TEST_QUERY_A = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=1" -TEST_QUERY_B = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=2" +TEST_QUERY_A = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=1, memory_usage_overcommit_max_wait_microseconds=500" +TEST_QUERY_B = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=2, memory_usage_overcommit_max_wait_microseconds=500" def 
test_overcommited_is_killed(): @@ -46,8 +46,12 @@ def test_overcommited_is_killed(): finished = True assert ( - overcommited_killed and finished - ), "no overcommited task was killed or all tasks are killed" + overcommited_killed + ), "no overcommited task was killed" + + assert ( + finished + ), "all tasks are killed" node.query("DROP USER IF EXISTS A") node.query("DROP USER IF EXISTS B") From 41ef0044f068f02d9fac278ec07dad6826825687 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Fri, 27 May 2022 13:43:34 -0400 Subject: [PATCH 12/69] endpoint is added --- src/Dictionaries/HTTPDictionarySource.cpp | 13 ++++++++++++- src/Storages/ExternalDataSourceConfiguration.cpp | 2 ++ src/Storages/ExternalDataSourceConfiguration.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp index 8f7ca5e7a51..17592a8d9da 100644 --- a/src/Dictionaries/HTTPDictionarySource.cpp +++ b/src/Dictionaries/HTTPDictionarySource.cpp @@ -233,6 +233,7 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory) Poco::Net::HTTPBasicCredentials credentials; ReadWriteBufferFromHTTP::HTTPHeaderEntries header_entries; String url; + String endpoint; String format; auto named_collection = created_from_ddl @@ -241,6 +242,7 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory) if (named_collection) { url = named_collection->configuration.url; + endpoint = named_collection->configuration.endpoint; format = named_collection->configuration.format; credentials.setUsername(named_collection->configuration.user); @@ -278,12 +280,21 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory) } url = config.getString(settings_config_prefix + ".url", ""); + endpoint = config.getString(settings_config_prefix + ".endpoint", ""); format =config.getString(settings_config_prefix + ".format", ""); } + if (url.ends_with('/')) + { + if (endpoint.starts_with('/')) + url.pop_back(); + } + else if (!endpoint.empty() && !endpoint.starts_with('/')) + url.push_back('/'); + auto configuration = HTTPDictionarySource::Configuration { - .url = url, + .url = url + endpoint, .format = format, .update_field = config.getString(settings_config_prefix + ".update_field", ""), .update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1), diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index 55eff117d5e..f916ac8c2af 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -263,6 +263,8 @@ std::optional getURLBasedDataSourceConfiguration( configuration.url = dict_config.getString(dict_config_prefix + ".url", config.getString(collection_prefix + ".url", "")); + configuration.endpoint = + dict_config.getString(dict_config_prefix + ".endpoint", config.getString(collection_prefix + ".endpoint", "")); configuration.format = dict_config.getString(dict_config_prefix + ".format", config.getString(collection_prefix + ".format", "")); configuration.compression_method = diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index 19301c360f0..4ed46e1b26c 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -99,6 +99,7 @@ getExternalDataSourceConfigurationByPriority(const Poco::Util::AbstractConfigura struct URLBasedDataSourceConfiguration { String url; + String endpoint; String 
format = "auto"; String compression_method = "auto"; String structure = "auto"; From db2fe33926af47aad3155407a5443768a307dd43 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 27 May 2022 20:43:59 +0200 Subject: [PATCH 13/69] Update test.py --- tests/integration/test_global_overcommit_tracker/test.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_global_overcommit_tracker/test.py b/tests/integration/test_global_overcommit_tracker/test.py index 093549249ce..871f9ca983e 100644 --- a/tests/integration/test_global_overcommit_tracker/test.py +++ b/tests/integration/test_global_overcommit_tracker/test.py @@ -45,13 +45,8 @@ def test_overcommited_is_killed(): if err == "": finished = True - assert ( - overcommited_killed - ), "no overcommited task was killed" - - assert ( - finished - ), "all tasks are killed" + assert overcommited_killed, "no overcommited task was killed" + assert finished, "all tasks are killed" node.query("DROP USER IF EXISTS A") node.query("DROP USER IF EXISTS B") From 3ff32fe81dcdb86d555917e9ed5f7437996e4579 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 28 May 2022 12:30:05 -0400 Subject: [PATCH 14/69] test is added --- .../integration/test_storage_dict/__init__.py | 0 .../test_storage_dict/configs/conf.xml | 16 ++++++++ tests/integration/test_storage_dict/test.py | 40 +++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 tests/integration/test_storage_dict/__init__.py create mode 100644 tests/integration/test_storage_dict/configs/conf.xml create mode 100644 tests/integration/test_storage_dict/test.py diff --git a/tests/integration/test_storage_dict/__init__.py b/tests/integration/test_storage_dict/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_storage_dict/configs/conf.xml b/tests/integration/test_storage_dict/configs/conf.xml new file mode 100644 index 00000000000..c2ecb518884 --- /dev/null +++ b/tests/integration/test_storage_dict/configs/conf.xml @@ -0,0 +1,16 @@ + + + + + http://nginx:80/test_dict + PUT + TSV + k String, v String + + + http://nginx:80/ + /test_dict + TabSeparated + + + diff --git a/tests/integration/test_storage_dict/test.py b/tests/integration/test_storage_dict/test.py new file mode 100644 index 00000000000..df224f08968 --- /dev/null +++ b/tests/integration/test_storage_dict/test.py @@ -0,0 +1,40 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +uuids = [] + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node1", main_configs=["configs/conf.xml"], with_nginx=True + ) + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def test_storage_dict(cluster): + node1 = cluster.instances["node1"] + + node1.query( + f"insert into table function url(urldb) values ('foo', 'bar')" + ) + result = node1.query( + f"select * from url(urldb)" + ) + assert result.strip() == "foo\tbar" + + node1.query( + f"create dictionary dict (k String, v String) primary key k source(http(name urldict)) layout(complex_key_hashed()) lifetime(min 0 max 100)" + ) + result = node1.query( + f"select * from dict" + ) + assert result.strip() == "foo\tbar" From 4f07c684da00ead69d24f0aa04b68f64b39b72db Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 28 May 2022 12:45:53 -0400 Subject: [PATCH 15/69] style fix --- tests/integration/test_storage_dict/test.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git 
a/tests/integration/test_storage_dict/test.py b/tests/integration/test_storage_dict/test.py index df224f08968..a5270a42114 100644 --- a/tests/integration/test_storage_dict/test.py +++ b/tests/integration/test_storage_dict/test.py @@ -23,18 +23,12 @@ def cluster(): def test_storage_dict(cluster): node1 = cluster.instances["node1"] - node1.query( - f"insert into table function url(urldb) values ('foo', 'bar')" - ) - result = node1.query( - f"select * from url(urldb)" - ) + node1.query(f"insert into table function url(urldb) values ('foo', 'bar')") + result = node1.query(f"select * from url(urldb)") assert result.strip() == "foo\tbar" node1.query( f"create dictionary dict (k String, v String) primary key k source(http(name urldict)) layout(complex_key_hashed()) lifetime(min 0 max 100)" ) - result = node1.query( - f"select * from dict" - ) + result = node1.query(f"select * from dict") assert result.strip() == "foo\tbar" From e2dd6f62495ee4056d9289028a93fada892cc147 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 30 May 2022 19:58:23 +0200 Subject: [PATCH 16/69] Removed prewhere_info.alias_actions --- src/Interpreters/ExpressionAnalyzer.cpp | 1 - src/Interpreters/InterpreterSelectQuery.cpp | 22 ------------------- src/Storages/IStorage.cpp | 5 ----- .../MergeTreeBaseSelectProcessor.cpp | 5 ----- .../MergeTree/MergeTreeBlockReadUtils.cpp | 19 ++++++---------- src/Storages/MergeTree/MergeTreeData.cpp | 13 ----------- .../MergeTree/MergeTreeRangeReader.cpp | 6 ----- src/Storages/MergeTree/MergeTreeRangeReader.h | 2 -- src/Storages/SelectQueryInfo.h | 2 -- src/Storages/StorageBuffer.cpp | 9 -------- 10 files changed, 7 insertions(+), 77 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 0b1154f6fd1..9ac0fe46553 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1970,7 +1970,6 @@ void ExpressionAnalysisResult::checkActions() const }; check_actions(prewhere_info->prewhere_actions); - check_actions(prewhere_info->alias_actions); } } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d143295181e..ebdd9612895 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1625,15 +1625,6 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan( { auto & prewhere_info = *prewhere_info_ptr; - if (prewhere_info.alias_actions) - { - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, - std::make_shared(prewhere_info.alias_actions)); - }); - } - if (prewhere_info.row_level_filter) { pipe.addSimpleTransform([&](const Block & header) @@ -1873,19 +1864,6 @@ void InterpreterSelectQuery::addPrewhereAliasActions() for (const auto & name : required_columns) prewhere_info->prewhere_actions->tryRestoreColumn(name); - auto analyzed_result - = TreeRewriter(context).analyze(required_columns_from_prewhere_expr, metadata_snapshot->getColumns().getAllPhysical()); - prewhere_info->alias_actions - = ExpressionAnalyzer(required_columns_from_prewhere_expr, analyzed_result, context).getActionsDAG(true, false); - - /// Add (physical?) columns required by alias actions. 
- auto required_columns_from_alias = prewhere_info->alias_actions->getRequiredColumns(); - Block prewhere_actions_result = prewhere_info->prewhere_actions->getResultColumns(); - for (auto & column : required_columns_from_alias) - if (!prewhere_actions_result.has(column.name)) - if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) - required_columns.push_back(column.name); - /// Add physical columns required by prewhere actions. for (const auto & column : required_columns_from_prewhere) if (!required_aliases_from_prewhere.contains(column)) diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index f236cb5e98c..88d60e00b9c 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -231,11 +231,6 @@ std::string PrewhereInfo::dump() const WriteBufferFromOwnString ss; ss << "PrewhereDagInfo\n"; - if (alias_actions) - { - ss << "alias_actions " << alias_actions->dumpDAG() << "\n"; - } - if (prewhere_actions) { ss << "prewhere_actions " << prewhere_actions->dumpDAG() << "\n"; diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index b8a3f0b1d1f..b9158bde6f1 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -72,8 +72,6 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( if (prewhere_info) { prewhere_actions = std::make_unique(); - if (prewhere_info->alias_actions) - prewhere_actions->alias_actions = std::make_shared(prewhere_info->alias_actions, actions_settings); if (prewhere_info->row_level_filter) prewhere_actions->row_level_filter = std::make_shared(prewhere_info->row_level_filter, actions_settings); @@ -556,9 +554,6 @@ Block MergeTreeBaseSelectProcessor::transformHeader( { if (prewhere_info) { - if (prewhere_info->alias_actions) - block = prewhere_info->alias_actions->updateHeader(std::move(block)); - if (prewhere_info->row_level_filter) { block = prewhere_info->row_level_filter->updateHeader(std::move(block)); diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 5cc22503348..f74823eaec2 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -281,21 +281,16 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (prewhere_info) { - if (prewhere_info->alias_actions) - pre_column_names = prewhere_info->alias_actions->getRequiredColumnsNames(); - else + pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames(); + + if (prewhere_info->row_level_filter) { - pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames(); + NameSet names(pre_column_names.begin(), pre_column_names.end()); - if (prewhere_info->row_level_filter) + for (auto & name : prewhere_info->row_level_filter->getRequiredColumnsNames()) { - NameSet names(pre_column_names.begin(), pre_column_names.end()); - - for (auto & name : prewhere_info->row_level_filter->getRequiredColumnsNames()) - { - if (!names.contains(name)) - pre_column_names.push_back(name); - } + if (!names.contains(name)) + pre_column_names.push_back(name); } } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 7726a752cbe..a38412d5c8f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5439,17 +5439,6 @@ std::optional 
MergeTreeData::getQueryProcessingStageWithAgg candidate.prewhere_info->row_level_filter = row_level_filter_actions; } - if (candidate.prewhere_info->alias_actions) - { - auto alias_actions = candidate.prewhere_info->alias_actions->clone(); - // alias_action should not add missing keys. - auto new_prewhere_required_columns - = alias_actions->foldActionsByProjection(prewhere_required_columns, projection.sample_block_for_keys, {}, false); - if (new_prewhere_required_columns.empty() && !prewhere_required_columns.empty()) - return false; - prewhere_required_columns = std::move(new_prewhere_required_columns); - candidate.prewhere_info->alias_actions = alias_actions; - } required_columns.insert(prewhere_required_columns.begin(), prewhere_required_columns.end()); } @@ -5619,8 +5608,6 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg if (minmax_count_projection_candidate->prewhere_info) { const auto & prewhere_info = minmax_count_projection_candidate->prewhere_info; - if (prewhere_info->alias_actions) - ExpressionActions(prewhere_info->alias_actions, actions_settings).execute(query_info.minmax_count_projection_block); if (prewhere_info->row_level_filter) { diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 7f9d1414886..addd9bdf7e0 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -564,9 +564,6 @@ MergeTreeRangeReader::MergeTreeRangeReader( if (prewhere_info) { - if (prewhere_info->alias_actions) - prewhere_info->alias_actions->execute(sample_block, true); - if (prewhere_info->row_level_filter) { prewhere_info->row_level_filter->execute(sample_block, true); @@ -1029,9 +1026,6 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r ++pos; } - if (prewhere_info->alias_actions) - prewhere_info->alias_actions->execute(block); - /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. result.block_before_prewhere = block; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index ed5cc16add8..21ed35e6a78 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -21,8 +21,6 @@ using ExpressionActionsPtr = std::shared_ptr; /// The same as PrewhereInfo, but with ExpressionActions instead of ActionsDAG struct PrewhereExprInfo { - /// Actions which are executed in order to alias columns are used for prewhere actions. - ExpressionActionsPtr alias_actions; /// Actions for row level security filter. Applied separately before prewhere_actions. /// This actions are separate because prewhere condition should not be executed over filtered rows. ExpressionActionsPtr row_level_filter; diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index b6643754db7..80194c5573f 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -48,8 +48,6 @@ using SubqueriesForSets = std::unordered_map; struct PrewhereInfo { - /// Actions which are executed in order to alias columns are used for prewhere actions. - ActionsDAGPtr alias_actions; /// Actions for row level security filter. Applied separately before prewhere_actions. /// This actions are separate because prewhere condition should not be executed over filtered rows. 
ActionsDAGPtr row_level_filter; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index e551abe0065..f54775a8706 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -383,15 +383,6 @@ void StorageBuffer::read( if (query_info.prewhere_info) { auto actions_settings = ExpressionActionsSettings::fromContext(local_context); - if (query_info.prewhere_info->alias_actions) - { - pipe_from_buffers.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - std::make_shared(query_info.prewhere_info->alias_actions, actions_settings)); - }); - } if (query_info.prewhere_info->row_level_filter) { From ad12adc31c5a5f440f52fdd72f5ae41880980920 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 27 May 2022 12:40:53 +0200 Subject: [PATCH 17/69] Measure and rework internal re2 caching This commit is based on local benchmarks of ClickHouse's re2 caching. Question 1: ----------------------------------------------------------- Is pattern caching useful for queries with const LIKE/REGEX patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T; The short answer is: no. Runtime is (unsurprisingly) dominated by pattern evaluation + other stuff going on in queries, but definitely not pattern compilation. For space reasons, I omit details of the local experiments. (Side note: the current caching scheme is unbounded in size which poses a DoS risk (think of multi-tenancy). This risk is more pronounced when unbounded caching is used with non-const patterns ..., see next question) Question 2: ----------------------------------------------------------- Is pattern caching useful for queries with non-const LIKE/REGEX patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T; I benchmarked five caching strategies: 1. no caching as a baseline (= recompile for each row) 2. unbounded cache (= threadsafe global hash-map) 3. LRU cache (= threadsafe global hash-map + LRU queue) 4. lightweight local cache 1 (= not threadsafe local hashmap with collision list which grows to a certain size (here: 10 elements) and afterwards never changes) 5. lightweight local cache 2 (not threadsafe local hashmap without collision list in which a collision replaces the stored element, idea by Alexey) ... using a haystack of 2 mio strings and A) 2 mio distinct simple patterns B) 10 simple patterns C) 2 mio distinct complex patterns D) 10 complex patterns For A) and C), caching does not help, but these queries still allow judging the static overhead of caching on query runtimes. B) and D) are extreme but common cases in practice. They include queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' : '%pattern2%')". Caching should help significantly. Because LIKE patterns are internally translated to re2 expressions, I show only measurements for MATCH queries. Results in sec, averaged over multiple measurements: 1.A): 2.12 B): 1.68 C): 9.75 D): 9.45 2.A): 2.17 B): 1.73 C): 9.78 D): 9.47 3.A): 9.8 B): 0.63 C): 31.8 D): 0.98 4.A): 2.14 B): 0.29 C): 9.82 D): 0.41 5.A) 2.12 / 2.15 / 2.26 B) 1.51 / 0.43 / 0.30 C) 9.97 / 9.88 / 10.13 D) 5.70 / 0.42 / 0.43 (10/100/1000 buckets, resp. 10/1/0.1% collision rate) Evaluation: 1. This is the baseline. I was surprised that complex patterns (C, D) slow down the queries so badly compared to simple patterns (A, B). The runtime includes evaluation costs, but as caching only helps with compilation, looking at 4.D and 5.D shows that compilation makes up over 90% of the runtime! 2.
No speedup compared to 1, probably due to locking overhead. The cache is unbounded, and in experiments with data sets > 2 mio rows, 2. is the only scheme to throw OOM exceptions, which is not acceptable. 3. Unique patterns (A and C) lead to thrashing of the LRU cache and very bad runtimes due to LRU queue maintenance and locking. It works pretty well, however, with few distinct patterns (B and D). 4. This scheme is tailored to queries B and D where it performs pretty well. More importantly, the caching is lightweight enough to not deteriorate performance on datasets A and C. 5. After some tuning of the hash map size, 100 buckets seem optimal to be in the same ballpark as 4. with 10 distinct patterns. Performance also does not deteriorate on A and C compared to the baseline. Unlike 4., this scheme behaves LRU-like and can adjust to changing pattern distributions. As a conclusion, this commit implements two things: 1. Based on Q1, pattern search with const needles no longer uses caching. This applies to LIKE and MATCH + a few (exotic) other SQL functions. The code for the unbounded caching was removed. 2. Based on Q2, pattern search with non-const needles now uses method 5. --- src/Functions/FunctionsStringArray.h | 4 +- src/Functions/MatchImpl.h | 30 +++++--- src/Functions/Regexps.h | 73 ++++++++++++++----- src/Functions/countMatches.h | 6 +- src/Functions/extract.cpp | 6 +- src/Functions/extractAllGroups.h | 4 +- src/Functions/extractGroups.cpp | 4 +- .../like_and_match_pattern_caching.xml | 62 ++++++++++++++++ 8 files changed, 145 insertions(+), 44 deletions(-) create mode 100644 tests/performance/like_and_match_pattern_caching.xml diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 6545c3e3549..b0f415be58a 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -477,7 +477,7 @@ public: ErrorCodes::ILLEGAL_COLUMN); if (!col->getValue().empty()) - re = Regexps::get(col->getValue()); + re = std::make_shared(Regexps::createRegexp(col->getValue())); } @@ -560,7 +560,7 @@ public: + " of first argument of function " + getName() + ". Must be constant string.", ErrorCodes::ILLEGAL_COLUMN); - re = Regexps::get(col->getValue()); + re = std::make_shared(Regexps::createRegexp(col->getValue())); capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0; matches.resize(capture + 1); diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 9779eb8d608..96ce0ca2eb0 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -166,17 +166,17 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle)); String required_substring; bool is_trivial; bool required_substring_is_prefix; /// for `anchored` execution of the regexp. - regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + regexp.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); if (required_substring.empty()) { - if (!regexp->getRE2()) /// An empty regexp. Always matches. + if (!regexp.getRE2()) /// An empty regexp. Always matches.
{ if (haystack_size) memset(res.data(), !negate, haystack_size * sizeof(res[0])); @@ -186,7 +186,7 @@ struct MatchImpl size_t prev_offset = 0; for (size_t i = 0; i < haystack_size; ++i) { - const bool match = regexp->getRE2()->Match( + const bool match = regexp.getRE2()->Match( {reinterpret_cast(&haystack_data[prev_offset]), haystack_offsets[i] - prev_offset - 1}, 0, haystack_offsets[i] - prev_offset - 1, @@ -241,7 +241,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast(pos) - str_data) : 0; const size_t end_pos = str_size; - const bool match = regexp->getRE2()->Match( + const bool match = regexp.getRE2()->Match( {str_data, str_size}, start_pos, end_pos, @@ -325,17 +325,17 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle)); String required_substring; bool is_trivial; bool required_substring_is_prefix; /// for `anchored` execution of the regexp. - regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + regexp.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); if (required_substring.empty()) { - if (!regexp->getRE2()) /// An empty regexp. Always matches. + if (!regexp.getRE2()) /// An empty regexp. Always matches. { if (haystack_size) memset(res.data(), !negate, haystack_size * sizeof(res[0])); @@ -345,7 +345,7 @@ struct MatchImpl size_t offset = 0; for (size_t i = 0; i < haystack_size; ++i) { - const bool match = regexp->getRE2()->Match( + const bool match = regexp.getRE2()->Match( {reinterpret_cast(&haystack[offset]), N}, 0, N, @@ -403,7 +403,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast(pos) - str_data) : 0; const size_t end_pos = N; - const bool match = regexp->getRE2()->Match( + const bool match = regexp.getRE2()->Match( {str_data, N}, start_pos, end_pos, @@ -454,6 +454,9 @@ struct MatchImpl size_t prev_haystack_offset = 0; size_t prev_needle_offset = 0; + Regexps::LocalCacheTable cache; + Regexps::RegexpPtr regexp; + for (size_t i = 0; i < haystack_size; ++i) { const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; @@ -479,7 +482,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + cache.getOrSet(needle, regexp); regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); @@ -565,6 +568,9 @@ struct MatchImpl size_t prev_haystack_offset = 0; size_t prev_needle_offset = 0; + Regexps::LocalCacheTable cache; + Regexps::RegexpPtr regexp; + for (size_t i = 0; i < haystack_size; ++i) { const auto * const cur_haystack_data = &haystack[prev_haystack_offset]; @@ -590,7 +596,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + cache.getOrSet(needle, regexp); regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index be3ce6cdeee..30afccbbac2 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -39,38 +38,72 @@ namespace ErrorCodes namespace Regexps { using Regexp = OptimizedRegularExpressionSingleThreaded; -using Cache = LRUCache; -using RegexpPtr = Cache::MappedPtr; +using RegexpPtr = std::shared_ptr; -template -inline int buildRe2Flags() +template +inline Regexp createRegexp(const std::string & pattern) { int flags = OptimizedRegularExpression::RE_DOT_NL; if constexpr (no_capture) 
flags |= OptimizedRegularExpression::RE_NO_CAPTURE; if constexpr (case_insensitive) flags |= OptimizedRegularExpression::RE_CASELESS; - return flags; + + if constexpr (like) + return {likePatternToRegexp(pattern), flags}; + else + return {pattern, flags}; } -/// Probes the cache of known compiled regexps for the given string pattern and returns a compiled regexp if -/// found. Otherwise, a new cache entry is created. -template -inline RegexpPtr get(const String & pattern) +/// Caches compiled re2 objects for given string patterns. Intended to support the common situation of a small set of patterns which are +/// evaluated over and over within the same query. In these situations, usage of the cache will save unnecessary pattern re-compilation. +/// However, we must be careful that caching does not add too much static overhead to overall pattern evaluation. Therefore, the cache is +/// intentionally very lightweight: a) no thread-safety/mutexes, b) small & fixed capacity, c) no collision list, d) but also no open +/// addressing, instead collisions simply replace the existing element. +class LocalCacheTable { - static Cache known_regexps(42'000); +public: + using RegexpPtr = std::shared_ptr; - auto [regexp_ptr, _] = known_regexps.getOrSet(pattern, [&pattern]() + LocalCacheTable() + : known_regexps(max_regexp_cache_size, {"", nullptr}) { - const int flags = buildRe2Flags(); - ProfileEvents::increment(ProfileEvents::RegexpCreated); - if constexpr (like) - return std::make_shared(likePatternToRegexp(pattern), flags); + } + + template + void getOrSet(const String & pattern, RegexpPtr & regexp) + { + StringAndRegexp & bucket = known_regexps[hasher(pattern) % max_regexp_cache_size]; + + if (likely(bucket.regexp != nullptr)) + { + if (pattern == bucket.pattern) + regexp = bucket.regexp; + else + { + regexp = std::make_shared(createRegexp(pattern)); + bucket = {pattern, regexp}; + } + } else - return std::make_shared(pattern, flags); - }); - return regexp_ptr; -} + { + regexp = std::make_shared(createRegexp(pattern)); + bucket = {pattern, regexp}; + } + } + +private: + std::hash hasher; + struct StringAndRegexp + { + std::string pattern; + RegexpPtr regexp; + }; + using CacheTable = std::vector; + CacheTable known_regexps; + + constexpr static size_t max_regexp_cache_size = 100; // collision probability +}; } diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 1d43b66d867..397515c8bba 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -55,7 +55,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - Regexps::RegexpPtr re = Regexps::get(column_pattern->getValue()); + const Regexps::Regexp re = Regexps::createRegexp(column_pattern->getValue()); OptimizedRegularExpression::MatchVec matches; const IColumn * column_haystack = arguments[0].column.get(); @@ -95,7 +95,7 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); } - static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches) + static uint64_t countMatches(StringRef src, const Regexps::Regexp & re, OptimizedRegularExpression::MatchVec & matches) { /// Only one match is required, no need to copy more. 
static const unsigned matches_limit = 1; @@ -108,7 +108,7 @@ public: { if (pos >= end) break; - if (!re->match(pos, end - pos, matches, matches_limit)) + if (!re.match(pos, end - pos, matches, matches_limit)) break; /// Progress should be made, but with empty match the progress will not be done. /// Also note that simply check is pattern empty is not enough, diff --git a/src/Functions/extract.cpp b/src/Functions/extract.cpp index 5b138d19747..f5917015e27 100644 --- a/src/Functions/extract.cpp +++ b/src/Functions/extract.cpp @@ -21,9 +21,9 @@ struct ExtractImpl res_data.reserve(data.size() / 5); res_offsets.resize(offsets.size()); - const auto & regexp = Regexps::get(pattern); + const Regexps::Regexp regexp = Regexps::createRegexp(pattern); - unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0; + unsigned capture = regexp.getNumberOfSubpatterns() > 0 ? 1 : 0; OptimizedRegularExpression::MatchVec matches; matches.reserve(capture + 1); size_t prev_offset = 0; @@ -34,7 +34,7 @@ struct ExtractImpl size_t cur_offset = offsets[i]; unsigned count - = regexp->match(reinterpret_cast(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1); + = regexp.match(reinterpret_cast(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1); if (count > capture && matches[capture].offset != std::string::npos) { const auto & match = matches[capture]; diff --git a/src/Functions/extractAllGroups.h b/src/Functions/extractAllGroups.h index e6d31e00616..1a40afbbe8e 100644 --- a/src/Functions/extractAllGroups.h +++ b/src/Functions/extractAllGroups.h @@ -95,8 +95,8 @@ public: throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); using StringPiece = typename Regexps::Regexp::StringPieceType; - auto holder = Regexps::get(needle); - const auto & regexp = holder->getRE2(); + const Regexps::Regexp holder = Regexps::createRegexp(needle); + const auto & regexp = holder.getRE2(); if (!regexp) throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); diff --git a/src/Functions/extractGroups.cpp b/src/Functions/extractGroups.cpp index c5b958ec345..940e76df1c0 100644 --- a/src/Functions/extractGroups.cpp +++ b/src/Functions/extractGroups.cpp @@ -63,8 +63,8 @@ public: if (needle.empty()) throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); - auto regexp = Regexps::get(needle); - const auto & re2 = regexp->getRE2(); + const Regexps::Regexp regexp = Regexps::createRegexp(needle); + const auto & re2 = regexp.getRE2(); if (!re2) throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); diff --git a/tests/performance/like_and_match_pattern_caching.xml b/tests/performance/like_and_match_pattern_caching.xml new file mode 100644 index 00000000000..c0a8ec9442e --- /dev/null +++ b/tests/performance/like_and_match_pattern_caching.xml @@ -0,0 +1,62 @@ + + + + + + numbers + + numbers_mt(2000000) + + + + needle_like + + simple patterns, all unique + '%' || toString(number) || '_' + simple patterns, low distinctness (10 patterns) + '%' || toString(number % 10) || '_' + + + + needle_match + + + '.*' || toString(number) || '.' + + '.*' || toString(number % 10) || '.' 
+ + '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number) + + '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?([^ @]+)@([^ @]+)([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])' || toString(number % 10) + + + + + + + + + select toString(number) as haystack, like(haystack, '%x_') + from(select * from {numbers}) + + + + select toString(number) as haystack, match(haystack, '.*x.') + from(select * from {numbers}) + + + + + + select toString(number) as haystack, {needle_like} as needle, like(haystack, needle) + from (select * from {numbers}); + + + + select toString(number) as haystack, {needle_match} as needle, match(haystack, needle) + from (select * from {numbers}); + + + From 9d04305a5a8614d61437c0ca1b598fde4014dca1 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 30 May 2022 23:00:28 +0200 Subject: [PATCH 18/69] Update Settings.h --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9111e1d80da..df8bbf0a307 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -371,7 +371,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \ M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation. Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \ \ - M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, "Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. If timeout is reached and memory is not freed, exception is thrown.", 0) \ + M(UInt64, memory_usage_overcommit_max_wait_microseconds, 5'000'000, "Maximum time thread will wait for memory to be freed in the case of memory overcommit. If timeout is reached and memory is not freed, exception is thrown.", 0) \ \ M(UInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. 
Zero means unlimited.", 0) \ M(UInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.", 0) \ From a2857491c42d560e882debe6aa2f5563079a7841 Mon Sep 17 00:00:00 2001 From: yaqi-zhao Date: Fri, 27 May 2022 13:54:11 -0400 Subject: [PATCH 19/69] add avx512 support for mergetreereader --- src/Common/TargetSpecific.cpp | 3 ++ src/Common/TargetSpecific.h | 1 + .../MergeTree/MergeTreeRangeReader.cpp | 29 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/src/Common/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp index 43319eff44b..369c21490d4 100644 --- a/src/Common/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -16,6 +16,8 @@ UInt32 getSupportedArchs() result |= static_cast(TargetArch::AVX2); if (Cpu::CpuFlagsCache::have_AVX512F) result |= static_cast(TargetArch::AVX512F); + if (Cpu::CpuFlagsCache::have_AVX512BW) + result |= static_cast(TargetArch::AVX512BW); return result; } @@ -34,6 +36,7 @@ String toString(TargetArch arch) case TargetArch::AVX: return "avx"; case TargetArch::AVX2: return "avx2"; case TargetArch::AVX512F: return "avx512f"; + case TargetArch::AVX512BW: return "avx512bw"; } __builtin_unreachable(); diff --git a/src/Common/TargetSpecific.h b/src/Common/TargetSpecific.h index d7fa55fbb08..522dd6e43c3 100644 --- a/src/Common/TargetSpecific.h +++ b/src/Common/TargetSpecific.h @@ -80,6 +80,7 @@ enum class TargetArch : UInt32 AVX = (1 << 1), AVX2 = (1 << 2), AVX512F = (1 << 3), + AVX512BW = (1 << 4), }; /// Runtime detection. diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index d8dba458203..84a1ab91906 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -10,6 +11,7 @@ #include #endif + namespace DB { namespace ErrorCodes @@ -449,6 +451,33 @@ size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, con { size_t count = 0; +#if defined(__AVX512F__) && defined(__AVX512BW__) /// check if avx512 instructions are compiled + if (isArchSupported(TargetArch::AVX512BW)) { + /// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F + const __m512i zero64 = _mm512_setzero_epi32(); + while (end - begin >= 64) + { + end -= 64; + const auto * pos = end; + UInt64 val = static_cast(_mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast(pos)), zero64, _MM_CMPINT_EQ)); + val = ~val; + if (val == 0) + { + count += 64; + } else + { + count += __builtin_clzll(val); + return count; + } + } + while (end > begin && *(--end) == 0) + { + ++count; + } + return count; + } +#endif + #if defined(__SSE2__) && defined(__POPCNT__) const __m128i zero16 = _mm_setzero_si128(); while (end - begin >= 64) From ba4cdd43bd19e79d2d94d491968a93755a20c9e4 Mon Sep 17 00:00:00 2001 From: xlwh <1079039435@qq.com> Date: Tue, 31 May 2022 14:37:30 +0800 Subject: [PATCH 20/69] Cleanup unused file --- src/Storages/ColumnCodec.h | 11 ----------- src/Storages/ColumnsDescription.h | 1 - 2 files changed, 12 deletions(-) delete mode 100644 src/Storages/ColumnCodec.h diff --git a/src/Storages/ColumnCodec.h b/src/Storages/ColumnCodec.h deleted file mode 100644 index 63a604c8198..00000000000 --- a/src/Storages/ColumnCodec.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - using ColumnCodecs = std::unordered_map; -} 
diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index c81ccb5d217..209dee885f4 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include From 582be423298958b72bfc7c88da90bc6a58ce9dc0 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 31 May 2022 08:31:00 +0000 Subject: [PATCH 21/69] Wait for leader election --- tests/integration/test_keeper_force_recovery/test.py | 12 +++++++++++- .../test_keeper_force_recovery_single_node/test.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index e4f42ba21f6..5f1b7d1e4e4 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -132,7 +132,17 @@ def test_cluster_recovery(started_cluster): nodes[0].stop_clickhouse() - add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + # we potentially killed the leader node so we give time for election + for _ in range(100): + try: + node_zks[1] = get_fake_zk(nodes[1].name, timeout=30.0) + add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + break + except Exception as ex: + time.sleep(0.5) + print(f"Retrying create on {nodes[1].name}, exception {ex}") + else: + raise Exception(f"Failed creating a node on {nodes[1].name}") for node_zk in node_zks[2:CLUSTER_SIZE]: wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") diff --git a/tests/integration/test_keeper_force_recovery_single_node/test.py b/tests/integration/test_keeper_force_recovery_single_node/test.py index 1e58a25221e..0a554e33119 100644 --- a/tests/integration/test_keeper_force_recovery_single_node/test.py +++ b/tests/integration/test_keeper_force_recovery_single_node/test.py @@ -121,7 +121,17 @@ def test_cluster_recovery(started_cluster): nodes[0].stop_clickhouse() - add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + # we potentially killed the leader node so we give time for election + for _ in range(100): + try: + node_zks[1] = get_fake_zk(nodes[1].name, timeout=30.0) + add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + break + except Exception as ex: + time.sleep(0.5) + print(f"Retrying create on {nodes[1].name}, exception {ex}") + else: + raise Exception(f"Failed creating a node on {nodes[1].name}") for node_zk in node_zks[2:CLUSTER_SIZE]: wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") From 69cd3a2b1060c1a2df4f4ab836ce4203b5c9a8ac Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 31 May 2022 14:20:31 +0200 Subject: [PATCH 22/69] Fix --- src/Databases/MySQL/DatabaseMySQL.cpp | 13 +++++++++++-- .../integration/test_mysql_database_engine/test.py | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/Databases/MySQL/DatabaseMySQL.cpp b/src/Databases/MySQL/DatabaseMySQL.cpp index 446518be5cd..58be682bd73 100644 --- a/src/Databases/MySQL/DatabaseMySQL.cpp +++ b/src/Databases/MySQL/DatabaseMySQL.cpp @@ -26,6 +26,7 @@ # include # include # include +# include namespace fs = std::filesystem; @@ -148,8 +149,16 @@ ASTPtr DatabaseMySQL::getCreateTableQueryImpl(const String & table_name, Context auto storage_engine_arguments = ast_storage->engine->arguments; /// Add table_name to engine arguments - auto mysql_table_name = std::make_shared(table_name); - 
storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, mysql_table_name); + if (typeid_cast(storage_engine_arguments->children[0].get())) + { + storage_engine_arguments->children.push_back( + makeASTFunction("equals", std::make_shared("table"), std::make_shared(table_name))); + } + else + { + auto mysql_table_name = std::make_shared(table_name); + storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, mysql_table_name); + } /// Unset settings std::erase_if(storage_children, [&](const ASTPtr & element) { return element.get() == ast_storage->settings; }); diff --git a/tests/integration/test_mysql_database_engine/test.py b/tests/integration/test_mysql_database_engine/test.py index 3e0a1a549d1..500d9176f4b 100644 --- a/tests/integration/test_mysql_database_engine/test.py +++ b/tests/integration/test_mysql_database_engine/test.py @@ -930,6 +930,10 @@ def test_predefined_connection_configuration(started_cluster): == "100" ) + result = clickhouse_node.query("show create table test_database.test_table") + print(result) + assert(result.strip() == "CREATE TABLE test_database.test_table\\n(\\n `id` Int32\\n)\\nENGINE = MySQL(mysql1, table = \\'test_table\\')") + clickhouse_node.query("DROP DATABASE test_database") clickhouse_node.query_and_get_error( "CREATE DATABASE test_database ENGINE = MySQL(mysql2)" From c2087b3145d53d6569578c2dff35f90981f30571 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 31 May 2022 14:38:11 +0200 Subject: [PATCH 23/69] Fix --- src/Storages/RabbitMQ/StorageRabbitMQ.cpp | 3 ++- tests/integration/test_storage_rabbitmq/test.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 678416ddd42..6d0a3f4ab6c 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -1154,7 +1154,8 @@ void registerStorageRabbitMQ(StorageFactory & factory) if (!with_named_collection && !args.storage_def->settings) throw Exception(ErrorCodes::BAD_ARGUMENTS, "RabbitMQ engine must have settings"); - rabbitmq_settings->loadFromQuery(*args.storage_def); + if (args.storage_def->settings) + rabbitmq_settings->loadFromQuery(*args.storage_def); if (!rabbitmq_settings->rabbitmq_host_port.changed && !rabbitmq_settings->rabbitmq_address.changed) diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index b18d9c26d88..c1bd136126f 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -27,6 +27,7 @@ instance = cluster.add_instance( ], user_configs=["configs/users.xml"], with_rabbitmq=True, + stay_alive=True, ) @@ -2732,6 +2733,16 @@ def test_rabbitmq_predefined_configuration(rabbitmq_cluster): ) if result == "1\t2\n": break + instance.restart_clickhouse() + channel.basic_publish( + exchange="named", routing_key="", body=json.dumps({"key": 1, "value": 2}) + ) + while True: + result = instance.query( + "SELECT * FROM test.rabbitmq ORDER BY key", ignore_error=True + ) + if result == "1\t2\n": + break if __name__ == "__main__": From 4b427336e3654f0880cb510feb544f01c7b33c23 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 31 May 2022 09:37:34 -0400 Subject: [PATCH 24/69] tests with overridden and appended parameters --- tests/integration/test_storage_dict/configs/conf.xml | 9 +++++++++ tests/integration/test_storage_dict/test.py | 12 ++++++++++++ 2 files 
changed, 21 insertions(+) diff --git a/tests/integration/test_storage_dict/configs/conf.xml b/tests/integration/test_storage_dict/configs/conf.xml index c2ecb518884..e37ca358e63 100644 --- a/tests/integration/test_storage_dict/configs/conf.xml +++ b/tests/integration/test_storage_dict/configs/conf.xml @@ -12,5 +12,14 @@ /test_dict TabSeparated + + http://nginx:80/ + /test_dict + + + http://nginx:80/ + /test_dict + CSV + diff --git a/tests/integration/test_storage_dict/test.py b/tests/integration/test_storage_dict/test.py index a5270a42114..1ed974f267d 100644 --- a/tests/integration/test_storage_dict/test.py +++ b/tests/integration/test_storage_dict/test.py @@ -32,3 +32,15 @@ def test_storage_dict(cluster): ) result = node1.query(f"select * from dict") assert result.strip() == "foo\tbar" + + node1.query( + f"create dictionary dict1 (k String, v String) primary key k source(http(name urldict1 format TabSeparated)) layout(complex_key_hashed()) lifetime(min 0 max 100)" + ) + result = node1.query(f"select * from dict1") + assert result.strip() == "foo\tbar" + + node1.query( + f"create dictionary dict2 (k String, v String) primary key k source(http(name urldict2 format TabSeparated)) layout(complex_key_hashed()) lifetime(min 0 max 100)" + ) + result = node1.query(f"select * from dict2") + assert result.strip() == "foo\tbar" From d1a4550b4fac51579fbe8f1aa0106e7a7a98be62 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 31 May 2022 17:23:41 +0200 Subject: [PATCH 25/69] Fix create or drop of sql user defined functions in readonly mode --- src/Access/ContextAccess.cpp | 3 ++- .../0_stateless/02315_readonly_create_function.reference | 1 + .../queries/0_stateless/02315_readonly_create_function.sh | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02315_readonly_create_function.reference create mode 100755 tests/queries/0_stateless/02315_readonly_create_function.sh diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 46fdba9d65e..89cdcc76234 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -445,10 +445,11 @@ bool ContextAccess::checkAccessImplHelper(AccessFlags flags, const Args &... 
arg const AccessFlags dictionary_ddl = AccessType::CREATE_DICTIONARY | AccessType::DROP_DICTIONARY; const AccessFlags function_ddl = AccessType::CREATE_FUNCTION | AccessType::DROP_FUNCTION; const AccessFlags table_and_dictionary_ddl = table_ddl | dictionary_ddl; + const AccessFlags table_and_dictionary_and_function_ddl = table_ddl | dictionary_ddl | function_ddl; const AccessFlags write_table_access = AccessType::INSERT | AccessType::OPTIMIZE; const AccessFlags write_dcl_access = AccessType::ACCESS_MANAGEMENT - AccessType::SHOW_ACCESS; - const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; + const AccessFlags not_readonly_flags = write_table_access | table_and_dictionary_and_function_ddl | write_dcl_access | AccessType::SYSTEM | AccessType::KILL_QUERY; const AccessFlags not_readonly_1_flags = AccessType::CREATE_TEMPORARY_TABLE; const AccessFlags ddl_flags = table_ddl | dictionary_ddl | function_ddl; diff --git a/tests/queries/0_stateless/02315_readonly_create_function.reference b/tests/queries/0_stateless/02315_readonly_create_function.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02315_readonly_create_function.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02315_readonly_create_function.sh b/tests/queries/0_stateless/02315_readonly_create_function.sh new file mode 100755 index 00000000000..03a4feb3038 --- /dev/null +++ b/tests/queries/0_stateless/02315_readonly_create_function.sh @@ -0,0 +1,7 @@ +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --readonly 1 --query "CREATE FUNCTION test_function AS (x) -> x + 1;" 2>&1 | grep -c -F 'Code: 164' From 66f43b9ad340a3e1caadea0670222c8b145492ea Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 31 May 2022 18:46:33 +0200 Subject: [PATCH 26/69] Fix executable user default functions execution with Nullable arguments --- .../UserDefinedExecutableFunctionFactory.cpp | 2 +- .../functions/test_function_config.xml | 22 +++++++++++++++++++ .../test.py | 12 ++++++++++ .../user_scripts/input_nullable.py | 12 ++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100755 tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py diff --git a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp index 5b5c7911735..b67e9c16ed5 100644 --- a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp +++ b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp @@ -43,7 +43,7 @@ public: size_t getNumberOfArguments() const override { return executable_function->getConfiguration().arguments.size(); } bool useDefaultImplementationForConstants() const override { return true; } - bool useDefaultImplementationForNulls() const override { return true; } + bool useDefaultImplementationForNulls() const override { return false; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } diff --git a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml index b2b7db83fbc..dce7ab2eacd 100644 --- 
a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml +++ b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml @@ -289,4 +289,26 @@ input_sum_json_named_args.py + + executable + test_function_nullable_python + String + + Nullable(UInt64) + + TabSeparated + input_nullable.py + + + + executable + test_function_nullable_pool_python + String + + Nullable(UInt64) + + TabSeparated + input_nullable.py + + diff --git a/tests/integration/test_executable_user_defined_function/test.py b/tests/integration/test_executable_user_defined_function/test.py index 10993e9c5dd..20beded2284 100644 --- a/tests/integration/test_executable_user_defined_function/test.py +++ b/tests/integration/test_executable_user_defined_function/test.py @@ -228,3 +228,15 @@ def test_executable_function_sum_json_python(started_cluster): ) node.query("DROP TABLE test_table;") + +def test_executable_function_input_nullable_python(started_cluster): + skip_test_msan(node) + + node.query("CREATE TABLE test_table_nullable (value Nullable(UInt64)) ENGINE=TinyLog;") + node.query("INSERT INTO test_table_nullable VALUES (0), (NULL), (2);") + + assert(node.query("SELECT test_function_nullable_python(1), test_function_nullable_python(NULL)") == "Key 1\tKey Nullable\n") + assert(node.query("SELECT test_function_nullable_python(value) FROM test_table_nullable;") == "Key 0\nKey Nullable\nKey 2\n") + + assert(node.query("SELECT test_function_nullable_pool_python(1), test_function_nullable_pool_python(NULL)") == "Key 1\tKey Nullable\n") + assert(node.query("SELECT test_function_nullable_pool_python(value) FROM test_table_nullable;") == "Key 0\nKey Nullable\nKey 2\n") diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py new file mode 100755 index 00000000000..f2f00a335bd --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == "__main__": + for line in sys.stdin: + if (line == "\\N\n"): + print("Key Nullable", end="\n") + else: + print("Key " + line, end="") + + sys.stdout.flush() From fdd190d010b7d9199b895f5ad3d6e600ebf4447e Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 31 May 2022 18:40:14 +0000 Subject: [PATCH 27/69] Fix typo in 00814_replicated_minimalistic_part_header_zookeeper.sql --- .../00814_replicated_minimalistic_part_header_zookeeper.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql index 5d03823dde3..f1fb4a8c4dc 100644 --- a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql +++ b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql @@ -44,7 +44,7 @@ SELECT name FROM system.parts WHERE active AND database = currentDatabase() AND SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/1r1/parts'; SELECT '*** replica 2 ***'; SELECT name FROM system.parts WHERE active AND database = currentDatabase() AND table = 'part_header_r2'; -SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/1r1/parts'; +SELECT name FROM 
system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/2r1/parts'; SELECT '*** Test ALTER ***'; ALTER TABLE part_header_r1 MODIFY COLUMN y String; From fbfab8558abd619e4535238330cc7ea1137b6f0d Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 31 May 2022 18:52:18 +0000 Subject: [PATCH 28/69] Add retries to 00814_replicated_minimalistic_part_header_zookeeper --- ...ted_minimalistic_part_header_zookeeper.sh} | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) rename tests/queries/0_stateless/{00814_replicated_minimalistic_part_header_zookeeper.sql => 00814_replicated_minimalistic_part_header_zookeeper.sh} (78%) mode change 100644 => 100755 diff --git a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh old mode 100644 new mode 100755 similarity index 78% rename from tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql rename to tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh index f1fb4a8c4dc..5917ed29533 --- a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql +++ b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh @@ -1,4 +1,13 @@ --- Tags: replica +#!/usr/bin/env bash +# Tags: replica + +set -e + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " DROP TABLE IF EXISTS part_header_r1; DROP TABLE IF EXISTS part_header_r2; @@ -36,7 +45,18 @@ SELECT _part, x FROM part_header_r1 ORDER BY x; SELECT '*** replica 2 ***'; SELECT _part, x FROM part_header_r2 ORDER BY x; -SELECT sleep(3) FORMAT Null; +" + +elapsed=1 +until [ $elapsed -eq 5 ]; +do + sleep $(( elapsed++ )) + count1=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/1r1/parts'") + count2=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/2r1/parts'") + [[ $count1 == 1 && $count2 == 1 ]] && break +done + +$CLICKHOUSE_CLIENT -nm -q " SELECT '*** Test part removal ***'; SELECT '*** replica 1 ***'; @@ -63,3 +83,5 @@ SELECT x, length(y) FROM part_header_r2 ORDER BY x; DROP TABLE part_header_r1; DROP TABLE part_header_r2; + +" From cf82df6ce782301db97f8424299cea3124c3a22b Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 31 May 2022 19:26:44 +0000 Subject: [PATCH 29/69] Use CLICKHOUSE_TEST_ZOOKEEPER_PREFIX in 00814_replicated_minimalistic_part_header --- ..._replicated_minimalistic_part_header_zookeeper.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh index 5917ed29533..6f609065c01 100755 --- a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh +++ b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh @@ -15,13 +15,13 @@ DROP TABLE IF EXISTS part_header_r2; SET replication_alter_partitions_sync = 2; CREATE TABLE part_header_r1(x UInt32, y UInt32) - ENGINE ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/{shard}', 
'1{replica}') ORDER BY x + ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/{shard}', '1{replica}') ORDER BY x SETTINGS use_minimalistic_part_header_in_zookeeper = 0, old_parts_lifetime = 1, cleanup_delay_period = 0, cleanup_delay_period_random_add = 0; CREATE TABLE part_header_r2(x UInt32, y UInt32) - ENGINE ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/{shard}', '2{replica}') ORDER BY x + ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/{shard}', '2{replica}') ORDER BY x SETTINGS use_minimalistic_part_header_in_zookeeper = 1, old_parts_lifetime = 1, cleanup_delay_period = 0, @@ -51,8 +51,8 @@ elapsed=1 until [ $elapsed -eq 5 ]; do sleep $(( elapsed++ )) - count1=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/1r1/parts'") - count2=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/2r1/parts'") + count1=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/s1/replicas/1r1/parts'") + count2=$($CLICKHOUSE_CLIENT --query="SELECT count(name) FROM system.zookeeper WHERE path = '/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/s1/replicas/2r1/parts'") [[ $count1 == 1 && $count2 == 1 ]] && break done @@ -61,10 +61,10 @@ $CLICKHOUSE_CLIENT -nm -q " SELECT '*** Test part removal ***'; SELECT '*** replica 1 ***'; SELECT name FROM system.parts WHERE active AND database = currentDatabase() AND table = 'part_header_r1'; -SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/1r1/parts'; +SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/s1/replicas/1r1/parts'; SELECT '*** replica 2 ***'; SELECT name FROM system.parts WHERE active AND database = currentDatabase() AND table = 'part_header_r2'; -SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/'||currentDatabase()||'/test_00814/part_header/s1/replicas/2r1/parts'; +SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_00814/part_header/s1/replicas/2r1/parts'; SELECT '*** Test ALTER ***'; ALTER TABLE part_header_r1 MODIFY COLUMN y String; From 26609a18751b38f0f866e3a3ad3f5309cc951136 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 31 May 2022 21:41:10 +0200 Subject: [PATCH 30/69] Style fixes --- src/Common/TargetSpecific.cpp | 2 +- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Common/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp index 369c21490d4..c52c8c2bcf0 100644 --- a/src/Common/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -17,7 +17,7 @@ UInt32 getSupportedArchs() if (Cpu::CpuFlagsCache::have_AVX512F) result |= static_cast(TargetArch::AVX512F); if (Cpu::CpuFlagsCache::have_AVX512BW) - result |= static_cast(TargetArch::AVX512BW); + result |= static_cast(TargetArch::AVX512BW); return result; } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp 
b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 84a1ab91906..56bd62ee271 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -452,7 +452,8 @@ size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, con size_t count = 0; #if defined(__AVX512F__) && defined(__AVX512BW__) /// check if avx512 instructions are compiled - if (isArchSupported(TargetArch::AVX512BW)) { + if (isArchSupported(TargetArch::AVX512BW)) + { /// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F const __m512i zero64 = _mm512_setzero_epi32(); while (end - begin >= 64) @@ -461,10 +462,9 @@ size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, con const auto * pos = end; UInt64 val = static_cast(_mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast(pos)), zero64, _MM_CMPINT_EQ)); val = ~val; - if (val == 0) - { + if (val == 0) count += 64; - } else + else { count += __builtin_clzll(val); return count; From 31e1e678366956ce6585a611baeb3dc53301641a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 1 Jun 2022 06:25:32 +0000 Subject: [PATCH 31/69] Initialize ParallelReadBuffer after construction --- src/IO/ParallelReadBuffer.cpp | 13 +++++++++++++ src/IO/ParallelReadBuffer.h | 5 +++++ src/Storages/StorageS3.cpp | 5 ++++- src/Storages/StorageURL.cpp | 10 +++++----- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 512f1c856b7..8d776a115a7 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -48,8 +48,15 @@ ParallelReadBuffer::ParallelReadBuffer( , max_working_readers(max_working_readers_) , schedule(std::move(schedule_)) , reader_factory(std::move(reader_factory_)) +{} + +void ParallelReadBuffer::initialize() { + if (initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is initialized twice"); + addReaders(); + initialized = true; } bool ParallelReadBuffer::addReaderToPool() @@ -76,6 +83,9 @@ void ParallelReadBuffer::addReaders() off_t ParallelReadBuffer::seek(off_t offset, int whence) { + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is not initialized"); + if (whence != SEEK_SET) throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); @@ -172,6 +182,9 @@ void ParallelReadBuffer::handleEmergencyStop() bool ParallelReadBuffer::nextImpl() { + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is not initialized"); + if (all_completed) return false; diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 83b978848f8..0568a5f0298 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -40,6 +40,9 @@ public: explicit ParallelReadBuffer(std::unique_ptr reader_factory_, CallbackRunner schedule_, size_t max_working_readers); + // some readers can throw exception during constructor call so we can't initialize ParallelReadBuffer there + void initialize(); + ~ParallelReadBuffer() override { finishAndWait(); } off_t seek(off_t off, int whence) override; @@ -96,6 +99,8 @@ private: off_t current_position{0}; bool all_completed{false}; + + bool initialized{false}; }; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 393ea0e24ff..39408e8ef36 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -479,7 +479,10 @@ std::unique_ptr 
StorageS3Source::createS3ReadBuffer(const String & k LOG_TRACE( log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); - return std::make_unique(std::move(factory), threadPoolCallbackRunner(IOThreadPool::get()), download_thread_num); + auto parallel_read_buffer + = std::make_unique(std::move(factory), threadPoolCallbackRunner(IOThreadPool::get()), download_thread_num); + parallel_read_buffer->initialize(); + return parallel_read_buffer; } String StorageS3Source::getName() const diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 062241797e0..fdc6adb8c15 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -345,12 +345,12 @@ namespace /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error); + auto parallel_read_buffer = std::make_unique( + std::move(read_buffer_factory), threadPoolCallbackRunner(IOThreadPool::get()), download_threads); + parallel_read_buffer->initialize(); + return wrapReadBufferWithCompressionMethod( - std::make_unique( - std::move(read_buffer_factory), - threadPoolCallbackRunner(IOThreadPool::get()), - download_threads), - chooseCompressionMethod(request_uri.getPath(), compression_method)); + std::move(parallel_read_buffer), chooseCompressionMethod(request_uri.getPath(), compression_method)); } } catch (const Poco::Exception & e) From 249fe561f4373a3c381a8c8ffc21e66ab476119a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 09:42:57 +0200 Subject: [PATCH 32/69] Fix build with -DENABLE_LIBRARIES=0 / -DENABLE_REPLXX=0 Replxx: When disabled via -DENABLE_LIBRARIES=0 or -DENABLE_REPLXX (the latter was undocumented) the build broke because replxx symbols were used since [0] in header LineReader.h. This header should in theory stay clean of replxx but doesn't for efficiency reasons. This change makes compilation of replxx mandatory. As replxx is quite small, I guess this is okay. (The alternative is to litter the code with ifdefs for non-replxx and a replxx paths.) [0] https://github.com/ClickHouse/ClickHouse/pull/33201 --- base/base/CMakeLists.txt | 5 +---- contrib/replxx-cmake/CMakeLists.txt | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index adffb91625c..175a4836e64 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -17,15 +17,12 @@ set (SRCS sleep.cpp terminalColors.cpp errnoToString.cpp + ReplxxLineReader.cpp StringRef.cpp safeExit.cpp throwError.cpp ) -if (ENABLE_REPLXX) - list (APPEND SRCS ReplxxLineReader.cpp) -endif () - if (USE_DEBUG_HELPERS) get_target_property(MAGIC_ENUM_INCLUDE_DIR ch_contrib::magic_enum INTERFACE_INCLUDE_DIRECTORIES) # CMake generator expression will do insane quoting when it encounters special character like quotes, spaces, etc. 
diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index 8487ad520bc..c7cf6eb7687 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -1,10 +1,3 @@ -option (ENABLE_REPLXX "Enable replxx support" ${ENABLE_LIBRARIES}) - -if (NOT ENABLE_REPLXX) - message (STATUS "Not using replxx") - return() -endif() - set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/replxx") set(SRCS From 12871a43e1090f2a48f1070989286857f7183723 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 10:10:06 +0200 Subject: [PATCH 33/69] Cosmetics --- contrib/abseil-cpp-cmake/CMakeLists.txt | 3 -- contrib/amqpcpp-cmake/CMakeLists.txt | 1 + contrib/arrow-cmake/CMakeLists.txt | 2 +- contrib/bzip2-cmake/CMakeLists.txt | 6 +--- contrib/cassandra-cmake/CMakeLists.txt | 1 + contrib/cppkafka-cmake/CMakeLists.txt | 2 +- contrib/fastops-cmake/CMakeLists.txt | 2 +- contrib/fmtlib-cmake/CMakeLists.txt | 32 ++++++++++--------- contrib/h3-cmake/CMakeLists.txt | 34 ++++++++++----------- contrib/hive-metastore-cmake/CMakeLists.txt | 2 +- contrib/libcpuid-cmake/CMakeLists.txt | 2 +- contrib/libgsasl-cmake/CMakeLists.txt | 2 +- contrib/libuv-cmake/CMakeLists.txt | 1 + contrib/minizip-ng-cmake/CMakeLists.txt | 2 +- contrib/nanodbc-cmake/CMakeLists.txt | 4 +-- contrib/thrift-cmake/CMakeLists.txt | 2 +- 16 files changed, 48 insertions(+), 50 deletions(-) diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index 4fb02327d17..4c31ecfc599 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -1,7 +1,4 @@ set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") -if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt") - message(FATAL_ERROR " submodule third_party/abseil-cpp is missing. 
To fix try run: \n git submodule update --init --recursive") -endif() set(BUILD_TESTING OFF) set(ABSL_PROPAGATE_CXX_STD ON) add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp") diff --git a/contrib/amqpcpp-cmake/CMakeLists.txt b/contrib/amqpcpp-cmake/CMakeLists.txt index 6e655d3c255..e5c17c234e9 100644 --- a/contrib/amqpcpp-cmake/CMakeLists.txt +++ b/contrib/amqpcpp-cmake/CMakeLists.txt @@ -5,6 +5,7 @@ if (NOT ENABLE_AMQPCPP) return() endif() +# can be removed once libuv build on MacOS with GCC is possible if (NOT TARGET ch_contrib::uv) message(STATUS "Not using AMQP-CPP because libuv is disabled") return() diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index a4574493440..74bbb300fa5 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -20,7 +20,7 @@ endif() option (ENABLE_PARQUET "Enable parquet" ${ENABLE_PARQUET_DEFAULT}) if (NOT ENABLE_PARQUET) - message(STATUS "Building without Parquet support") + message(STATUS "Not using parquet") return() endif() diff --git a/contrib/bzip2-cmake/CMakeLists.txt b/contrib/bzip2-cmake/CMakeLists.txt index 2e01a624000..693d4c1663c 100644 --- a/contrib/bzip2-cmake/CMakeLists.txt +++ b/contrib/bzip2-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option(ENABLE_BZIP2 "Enable bzip2 compression support" ${ENABLE_LIBRARIES}) if (NOT ENABLE_BZIP2) - message (STATUS "bzip2 compression disabled") + message (STATUS "Not using bzip2") return() endif() @@ -26,8 +26,4 @@ configure_file ( add_library(_bzip2 ${SRCS}) add_library(ch_contrib::bzip2 ALIAS _bzip2) -# To avoid -Wreserved-id-macro we use SYSTEM: -# -# clickhouse/contrib/bzip2/bzlib.h:23:9: error: macro name is a reserved identifier [-Werror,-Wreserved-id-macro] -# #define _BZLIB_H target_include_directories(_bzip2 SYSTEM BEFORE PUBLIC "${BZIP2_SOURCE_DIR}" "${BZIP2_BINARY_DIR}") diff --git a/contrib/cassandra-cmake/CMakeLists.txt b/contrib/cassandra-cmake/CMakeLists.txt index 986ac438bb2..59ff908b63a 100644 --- a/contrib/cassandra-cmake/CMakeLists.txt +++ b/contrib/cassandra-cmake/CMakeLists.txt @@ -5,6 +5,7 @@ if (NOT ENABLE_CASSANDRA) return() endif() +# can be removed once libuv build on MacOS with GCC is possible if (NOT TARGET ch_contrib::uv) message(STATUS "Not using cassandra because libuv is disabled") return() diff --git a/contrib/cppkafka-cmake/CMakeLists.txt b/contrib/cppkafka-cmake/CMakeLists.txt index 87bf2356a80..fa1c52180e8 100644 --- a/contrib/cppkafka-cmake/CMakeLists.txt +++ b/contrib/cppkafka-cmake/CMakeLists.txt @@ -1,5 +1,5 @@ if (NOT ENABLE_KAFKA) - message(STATUS "Not using librdkafka (skip cppkafka)") + message(STATUS "Not using kafka") return() endif() diff --git a/contrib/fastops-cmake/CMakeLists.txt b/contrib/fastops-cmake/CMakeLists.txt index 17d6a7f5fcb..e9aa4803583 100644 --- a/contrib/fastops-cmake/CMakeLists.txt +++ b/contrib/fastops-cmake/CMakeLists.txt @@ -5,7 +5,7 @@ elseif(ENABLE_FASTOPS) endif() if(NOT ENABLE_FASTOPS) - message(STATUS "Not using fast vectorized mathematical functions library by Mikhail Parakhin") + message(STATUS "Not using fastops") return() endif() diff --git a/contrib/fmtlib-cmake/CMakeLists.txt b/contrib/fmtlib-cmake/CMakeLists.txt index fecec5f3e43..fe399ddc6e1 100644 --- a/contrib/fmtlib-cmake/CMakeLists.txt +++ b/contrib/fmtlib-cmake/CMakeLists.txt @@ -1,22 +1,24 @@ +set(FMT_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/fmtlib") + set (SRCS # NOTE: do not build module for now: # ../fmtlib/src/fmt.cc - ../fmtlib/src/format.cc - ../fmtlib/src/os.cc + 
${FMT_SOURCE_DIR}/src/format.cc + ${FMT_SOURCE_DIR}/src/os.cc - ../fmtlib/include/fmt/args.h - ../fmtlib/include/fmt/chrono.h - ../fmtlib/include/fmt/color.h - ../fmtlib/include/fmt/compile.h - ../fmtlib/include/fmt/core.h - ../fmtlib/include/fmt/format.h - ../fmtlib/include/fmt/format-inl.h - ../fmtlib/include/fmt/locale.h - ../fmtlib/include/fmt/os.h - ../fmtlib/include/fmt/ostream.h - ../fmtlib/include/fmt/printf.h - ../fmtlib/include/fmt/ranges.h - ../fmtlib/include/fmt/xchar.h + ${FMT_SOURCE_DIR}/include/fmt/args.h + ${FMT_SOURCE_DIR}/include/fmt/chrono.h + ${FMT_SOURCE_DIR}/include/fmt/color.h + ${FMT_SOURCE_DIR}/include/fmt/compile.h + ${FMT_SOURCE_DIR}/include/fmt/core.h + ${FMT_SOURCE_DIR}/include/fmt/format.h + ${FMT_SOURCE_DIR}/include/fmt/format-inl.h + ${FMT_SOURCE_DIR}/include/fmt/locale.h + ${FMT_SOURCE_DIR}/include/fmt/os.h + ${FMT_SOURCE_DIR}/include/fmt/ostream.h + ${FMT_SOURCE_DIR}/include/fmt/printf.h + ${FMT_SOURCE_DIR}/include/fmt/ranges.h + ${FMT_SOURCE_DIR}/include/fmt/xchar.h ) add_library(_fmt ${SRCS}) diff --git a/contrib/h3-cmake/CMakeLists.txt b/contrib/h3-cmake/CMakeLists.txt index 984d1b1ae7c..869550224e6 100644 --- a/contrib/h3-cmake/CMakeLists.txt +++ b/contrib/h3-cmake/CMakeLists.txt @@ -9,23 +9,23 @@ set(H3_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/h3/src/h3lib") set(H3_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/h3/src/h3lib") set(SRCS -"${H3_SOURCE_DIR}/lib/algos.c" -"${H3_SOURCE_DIR}/lib/coordijk.c" -"${H3_SOURCE_DIR}/lib/bbox.c" -"${H3_SOURCE_DIR}/lib/polygon.c" -"${H3_SOURCE_DIR}/lib/h3Index.c" -"${H3_SOURCE_DIR}/lib/vec2d.c" -"${H3_SOURCE_DIR}/lib/vec3d.c" -"${H3_SOURCE_DIR}/lib/vertex.c" -"${H3_SOURCE_DIR}/lib/linkedGeo.c" -"${H3_SOURCE_DIR}/lib/localij.c" -"${H3_SOURCE_DIR}/lib/latLng.c" -"${H3_SOURCE_DIR}/lib/directedEdge.c" -"${H3_SOURCE_DIR}/lib/mathExtensions.c" -"${H3_SOURCE_DIR}/lib/iterators.c" -"${H3_SOURCE_DIR}/lib/vertexGraph.c" -"${H3_SOURCE_DIR}/lib/faceijk.c" -"${H3_SOURCE_DIR}/lib/baseCells.c" + "${H3_SOURCE_DIR}/lib/algos.c" + "${H3_SOURCE_DIR}/lib/coordijk.c" + "${H3_SOURCE_DIR}/lib/bbox.c" + "${H3_SOURCE_DIR}/lib/polygon.c" + "${H3_SOURCE_DIR}/lib/h3Index.c" + "${H3_SOURCE_DIR}/lib/vec2d.c" + "${H3_SOURCE_DIR}/lib/vec3d.c" + "${H3_SOURCE_DIR}/lib/vertex.c" + "${H3_SOURCE_DIR}/lib/linkedGeo.c" + "${H3_SOURCE_DIR}/lib/localij.c" + "${H3_SOURCE_DIR}/lib/latLng.c" + "${H3_SOURCE_DIR}/lib/directedEdge.c" + "${H3_SOURCE_DIR}/lib/mathExtensions.c" + "${H3_SOURCE_DIR}/lib/iterators.c" + "${H3_SOURCE_DIR}/lib/vertexGraph.c" + "${H3_SOURCE_DIR}/lib/faceijk.c" + "${H3_SOURCE_DIR}/lib/baseCells.c" ) configure_file("${H3_SOURCE_DIR}/include/h3api.h.in" "${H3_BINARY_DIR}/include/h3api.h") diff --git a/contrib/hive-metastore-cmake/CMakeLists.txt b/contrib/hive-metastore-cmake/CMakeLists.txt index 9069d46cea7..a5e16c739af 100644 --- a/contrib/hive-metastore-cmake/CMakeLists.txt +++ b/contrib/hive-metastore-cmake/CMakeLists.txt @@ -5,7 +5,7 @@ elseif(ENABLE_HIVE) endif() if (NOT ENABLE_HIVE) - message("Hive disabled") + message(STATUS "Not using hive") return() endif() diff --git a/contrib/libcpuid-cmake/CMakeLists.txt b/contrib/libcpuid-cmake/CMakeLists.txt index 1940b39b6aa..95f653c7ea2 100644 --- a/contrib/libcpuid-cmake/CMakeLists.txt +++ b/contrib/libcpuid-cmake/CMakeLists.txt @@ -6,7 +6,7 @@ elseif(ENABLE_CPUID) endif() if (NOT ENABLE_CPUID) - message("Not using cpuid") + message(STATUS "Not using cpuid") return() endif() diff --git a/contrib/libgsasl-cmake/CMakeLists.txt b/contrib/libgsasl-cmake/CMakeLists.txt index 
4bb4ca9dc33..3cf087c2f4c 100644 --- a/contrib/libgsasl-cmake/CMakeLists.txt +++ b/contrib/libgsasl-cmake/CMakeLists.txt @@ -1,7 +1,7 @@ option(ENABLE_GSASL_LIBRARY "Enable gsasl library" ${ENABLE_LIBRARIES}) if (NOT ENABLE_GSASL_LIBRARY) - message(STATUS "Not using gsasl library") + message(STATUS "Not using gsasl") return() endif() diff --git a/contrib/libuv-cmake/CMakeLists.txt b/contrib/libuv-cmake/CMakeLists.txt index 45f6d8e2083..1a7714e47ce 100644 --- a/contrib/libuv-cmake/CMakeLists.txt +++ b/contrib/libuv-cmake/CMakeLists.txt @@ -1,3 +1,4 @@ +# once fixed, please remove similar places in CMakeLists of libuv users (search "ch_contrib::uv") if (OS_DARWIN AND COMPILER_GCC) message (WARNING "libuv cannot be built with GCC in macOS due to a bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93082") return() diff --git a/contrib/minizip-ng-cmake/CMakeLists.txt b/contrib/minizip-ng-cmake/CMakeLists.txt index 4aabbd3c9fb..043f0fc68f9 100644 --- a/contrib/minizip-ng-cmake/CMakeLists.txt +++ b/contrib/minizip-ng-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option(ENABLE_MINIZIP "Enable minizip-ng the zip manipulation library" ${ENABLE_LIBRARIES}) if (NOT ENABLE_MINIZIP) - message (STATUS "minizip-ng disabled") + message (STATUS "Not using minizip-ng") return() endif() diff --git a/contrib/nanodbc-cmake/CMakeLists.txt b/contrib/nanodbc-cmake/CMakeLists.txt index 9ed6c9525b6..7aacf5bed7e 100644 --- a/contrib/nanodbc-cmake/CMakeLists.txt +++ b/contrib/nanodbc-cmake/CMakeLists.txt @@ -2,12 +2,12 @@ if (NOT ENABLE_ODBC) return () endif () -set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nanodbc") - if (NOT TARGET ch_contrib::unixodbc) message(FATAL_ERROR "Configuration error: unixodbc is not a target") endif() +set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nanodbc") + set (SRCS "${LIBRARY_DIR}/nanodbc/nanodbc.cpp" ) diff --git a/contrib/thrift-cmake/CMakeLists.txt b/contrib/thrift-cmake/CMakeLists.txt index 2a62a6fe7ab..6f94c1ebdc0 100644 --- a/contrib/thrift-cmake/CMakeLists.txt +++ b/contrib/thrift-cmake/CMakeLists.txt @@ -1,7 +1,7 @@ option(ENABLE_THRIFT "Enable Thrift" ${ENABLE_LIBRARIES}) if (NOT ENABLE_THRIFT) - message (STATUS "thrift disabled") + message (STATUS "Not using thrift") return() endif() From a4e037c728e3f8fbbc0e038d1173dbac4c9bde67 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 10:24:12 +0200 Subject: [PATCH 34/69] Remove unused M_LIBRARY link --- contrib/brotli-cmake/CMakeLists.txt | 3 --- contrib/h3-cmake/CMakeLists.txt | 3 --- contrib/libxml2-cmake/CMakeLists.txt | 3 --- contrib/s2geometry-cmake/CMakeLists.txt | 4 ---- 4 files changed, 13 deletions(-) diff --git a/contrib/brotli-cmake/CMakeLists.txt b/contrib/brotli-cmake/CMakeLists.txt index c81a6bf9076..b89e81ecda1 100644 --- a/contrib/brotli-cmake/CMakeLists.txt +++ b/contrib/brotli-cmake/CMakeLists.txt @@ -45,7 +45,4 @@ add_library(ch_contrib::brotli ALIAS _brotli) target_include_directories(_brotli SYSTEM BEFORE PUBLIC "${BROTLI_SOURCE_DIR}/include") -if(M_LIBRARY) - target_link_libraries(_brotli PRIVATE ${M_LIBRARY}) -endif() target_compile_definitions(_brotli PRIVATE BROTLI_BUILD_PORTABLE=1) diff --git a/contrib/h3-cmake/CMakeLists.txt b/contrib/h3-cmake/CMakeLists.txt index 869550224e6..c0c2162bd26 100644 --- a/contrib/h3-cmake/CMakeLists.txt +++ b/contrib/h3-cmake/CMakeLists.txt @@ -34,8 +34,5 @@ add_library(_h3 ${SRCS}) target_include_directories(_h3 SYSTEM PUBLIC "${H3_SOURCE_DIR}/include") target_include_directories(_h3 SYSTEM PUBLIC "${H3_BINARY_DIR}/include") target_compile_definitions(_h3 
PRIVATE H3_HAVE_VLA) -if(M_LIBRARY) - target_link_libraries(_h3 PRIVATE ${M_LIBRARY}) -endif() add_library(ch_contrib::h3 ALIAS _h3) diff --git a/contrib/libxml2-cmake/CMakeLists.txt b/contrib/libxml2-cmake/CMakeLists.txt index e9c4641c161..a84936f8e3a 100644 --- a/contrib/libxml2-cmake/CMakeLists.txt +++ b/contrib/libxml2-cmake/CMakeLists.txt @@ -53,9 +53,6 @@ set(SRCS add_library(_libxml2 ${SRCS}) target_link_libraries(_libxml2 PRIVATE ch_contrib::zlib) -if(M_LIBRARY) - target_link_libraries(_libxml2 PRIVATE ${M_LIBRARY}) -endif() target_include_directories(_libxml2 BEFORE PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include") target_include_directories(_libxml2 BEFORE PUBLIC "${LIBXML2_SOURCE_DIR}/include") diff --git a/contrib/s2geometry-cmake/CMakeLists.txt b/contrib/s2geometry-cmake/CMakeLists.txt index 49c80e45b18..102ceb0db3c 100644 --- a/contrib/s2geometry-cmake/CMakeLists.txt +++ b/contrib/s2geometry-cmake/CMakeLists.txt @@ -149,7 +149,3 @@ target_link_libraries(_s2 PRIVATE target_include_directories(_s2 SYSTEM BEFORE PUBLIC "${S2_SOURCE_DIR}/") target_include_directories(_s2 SYSTEM PUBLIC "${ABSL_SOURCE_DIR}") - -if(M_LIBRARY) - target_link_libraries(_s2 PRIVATE ${M_LIBRARY}) -endif() From 933f98a900a5dfe4ad06774133f81281b73780b5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 10:47:11 +0200 Subject: [PATCH 35/69] Removed warning flags in contribs warnings are disabled for all contribs in contrib/CMakeLists.txt already --- contrib/amqpcpp-cmake/CMakeLists.txt | 15 --------------- contrib/avro-cmake/CMakeLists.txt | 8 -------- contrib/azure-cmake/CMakeLists.txt | 14 -------------- contrib/capnproto-cmake/CMakeLists.txt | 12 ++++-------- contrib/icu-cmake/CMakeLists.txt | 4 ---- contrib/jemalloc-cmake/CMakeLists.txt | 1 - contrib/libcpuid-cmake/CMakeLists.txt | 3 --- contrib/replxx-cmake/CMakeLists.txt | 5 ----- contrib/unixodbc-cmake/CMakeLists.txt | 10 +--------- 9 files changed, 5 insertions(+), 67 deletions(-) diff --git a/contrib/amqpcpp-cmake/CMakeLists.txt b/contrib/amqpcpp-cmake/CMakeLists.txt index e5c17c234e9..6f6a0188e6f 100644 --- a/contrib/amqpcpp-cmake/CMakeLists.txt +++ b/contrib/amqpcpp-cmake/CMakeLists.txt @@ -38,21 +38,6 @@ set (SRCS add_library(_amqp-cpp ${SRCS}) -target_compile_options (_amqp-cpp - PRIVATE - -Wno-old-style-cast - -Wno-inconsistent-missing-destructor-override - -Wno-deprecated - -Wno-unused-parameter - -Wno-shadow - -Wno-tautological-type-limit-compare - -Wno-extra-semi -# NOTE: disable all warnings at last because the warning: - # "conversion function converting 'XXX' to itself will never be used" - # doesn't have it's own diagnostic flag yet. 
- -w -) - target_include_directories (_amqp-cpp SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include" "${LIBRARY_DIR}") target_link_libraries (_amqp-cpp PUBLIC OpenSSL::Crypto OpenSSL::SSL ch_contrib::uv) add_library (ch_contrib::amqp_cpp ALIAS _amqp-cpp) diff --git a/contrib/avro-cmake/CMakeLists.txt b/contrib/avro-cmake/CMakeLists.txt index c5bda41782d..25474650d0e 100644 --- a/contrib/avro-cmake/CMakeLists.txt +++ b/contrib/avro-cmake/CMakeLists.txt @@ -60,14 +60,6 @@ target_compile_definitions (_avrocpp PUBLIC SNAPPY_CODEC_AVAILABLE) target_include_directories (_avrocpp PRIVATE ${SNAPPY_INCLUDE_DIR}) target_link_libraries (_avrocpp PRIVATE ch_contrib::snappy) -if (COMPILER_GCC) - set (SUPPRESS_WARNINGS -Wno-non-virtual-dtor) -elseif (COMPILER_CLANG) - set (SUPPRESS_WARNINGS -Wno-non-virtual-dtor) -endif () - -target_compile_options(_avrocpp PRIVATE ${SUPPRESS_WARNINGS}) - # create a symlink to include headers with set(AVRO_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include") ADD_CUSTOM_TARGET(avro_symlink_headers ALL diff --git a/contrib/azure-cmake/CMakeLists.txt b/contrib/azure-cmake/CMakeLists.txt index 031d8dc9a0b..19f2940cbf0 100644 --- a/contrib/azure-cmake/CMakeLists.txt +++ b/contrib/azure-cmake/CMakeLists.txt @@ -52,20 +52,6 @@ include("${AZURE_DIR}/cmake-modules/AzureTransportAdapters.cmake") add_library(_azure_sdk ${AZURE_SDK_UNIFIED_SRC}) -if (COMPILER_CLANG) - target_compile_options(_azure_sdk PRIVATE - -Wno-deprecated-copy-dtor - -Wno-extra-semi - -Wno-suggest-destructor-override - -Wno-inconsistent-missing-destructor-override - -Wno-error=unknown-warning-option - ) - - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) - target_compile_options(_azure_sdk PRIVATE -Wno-reserved-identifier) - endif() -endif() - # Originally, on Windows azure-core is built with bcrypt and crypt32 by default if (TARGET OpenSSL::SSL) target_link_libraries(_azure_sdk PRIVATE OpenSSL::Crypto OpenSSL::SSL) diff --git a/contrib/capnproto-cmake/CMakeLists.txt b/contrib/capnproto-cmake/CMakeLists.txt index 297b847cd58..e76268592ee 100644 --- a/contrib/capnproto-cmake/CMakeLists.txt +++ b/contrib/capnproto-cmake/CMakeLists.txt @@ -81,16 +81,12 @@ set (CAPNPC_SRCS add_library(_capnpc ${CAPNPC_SRCS}) target_link_libraries(_capnpc PUBLIC _capnp) -# The library has substandard code -if (COMPILER_GCC) - set (SUPPRESS_WARNINGS -w) -elseif (COMPILER_CLANG) - set (SUPPRESS_WARNINGS -w) +if (COMPILER_CLANG) set (CAPNP_PRIVATE_CXX_FLAGS -fno-char8_t) endif () -target_compile_options(_kj PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) -target_compile_options(_capnp PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) -target_compile_options(_capnpc PRIVATE ${SUPPRESS_WARNINGS} ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_kj PRIVATE ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_capnp PRIVATE ${CAPNP_PRIVATE_CXX_FLAGS}) +target_compile_options(_capnpc PRIVATE ${CAPNP_PRIVATE_CXX_FLAGS}) add_library(ch_contrib::capnp ALIAS _capnpc) diff --git a/contrib/icu-cmake/CMakeLists.txt b/contrib/icu-cmake/CMakeLists.txt index 9c34228e2a0..ce82155218c 100644 --- a/contrib/icu-cmake/CMakeLists.txt +++ b/contrib/icu-cmake/CMakeLists.txt @@ -481,10 +481,6 @@ target_include_directories(_icui18n SYSTEM PUBLIC "${ICU_SOURCE_DIR}/i18n/") target_compile_definitions(_icuuc PRIVATE -DU_COMMON_IMPLEMENTATION) target_compile_definitions(_icui18n PRIVATE -DU_I18N_IMPLEMENTATION) -if (COMPILER_CLANG) - target_compile_options(_icudata PRIVATE -Wno-unused-command-line-argument) -endif () - add_library(_icu 
INTERFACE) target_link_libraries(_icu INTERFACE _icui18n _icuuc _icudata) add_library(ch_contrib::icu ALIAS _icu) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index c59b4da890b..fdb0fd0e8af 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -180,7 +180,6 @@ if (USE_UNWIND) target_link_libraries (_jemalloc PRIVATE unwind) endif () -target_compile_options(_jemalloc PRIVATE -Wno-redundant-decls) # for RTLD_NEXT target_compile_options(_jemalloc PRIVATE -D_GNU_SOURCE) diff --git a/contrib/libcpuid-cmake/CMakeLists.txt b/contrib/libcpuid-cmake/CMakeLists.txt index 95f653c7ea2..fd5af925c57 100644 --- a/contrib/libcpuid-cmake/CMakeLists.txt +++ b/contrib/libcpuid-cmake/CMakeLists.txt @@ -27,8 +27,5 @@ add_library (_cpuid ${SRCS}) target_include_directories (_cpuid SYSTEM PUBLIC "${LIBRARY_DIR}") target_compile_definitions (_cpuid PRIVATE VERSION="v0.4.1") -if (COMPILER_CLANG) - target_compile_options (_cpuid PRIVATE -Wno-reserved-id-macro) -endif () add_library(ch_contrib::cpuid ALIAS _cpuid) diff --git a/contrib/replxx-cmake/CMakeLists.txt b/contrib/replxx-cmake/CMakeLists.txt index c7cf6eb7687..95a19875621 100644 --- a/contrib/replxx-cmake/CMakeLists.txt +++ b/contrib/replxx-cmake/CMakeLists.txt @@ -15,9 +15,4 @@ set(SRCS add_library (_replxx ${SRCS}) target_include_directories(_replxx SYSTEM PUBLIC "${LIBRARY_DIR}/include") - -if (COMPILER_CLANG) - target_compile_options(_replxx PRIVATE -Wno-documentation) -endif () - add_library(ch_contrib::replxx ALIAS _replxx) diff --git a/contrib/unixodbc-cmake/CMakeLists.txt b/contrib/unixodbc-cmake/CMakeLists.txt index b594ead3ba0..3317654cd67 100644 --- a/contrib/unixodbc-cmake/CMakeLists.txt +++ b/contrib/unixodbc-cmake/CMakeLists.txt @@ -294,14 +294,6 @@ target_include_directories (_unixodbc "${LIBRARY_DIR}/include" ) target_compile_definitions (_unixodbc PRIVATE -DHAVE_CONFIG_H) -target_compile_options (_unixodbc - PRIVATE - -Wno-dangling-else - -Wno-parentheses - -Wno-misleading-indentation - -Wno-unknown-warning-option - -Wno-reserved-id-macro - -O2 -) +target_compile_options (_unixodbc PRIVATE -O2) # intended? add_library (ch_contrib::unixodbc ALIAS _unixodbc) From 393b97763a7879d5df91023b16e296a6a5c2492e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 11:18:56 +0200 Subject: [PATCH 36/69] Make SSL a mandatory dependency for now - SSL is a dependency of too many libs + unit tests (via poco crypto which requires SSL) - optional SSL is desirable but right now, turning off SSL (via -DENABLE_LIBRARIES=0 or =DENABLE_SSL=0) breaks the build - therefore make SSL mandatory for now + add a TODO comment --- contrib/boringssl-cmake/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/boringssl-cmake/CMakeLists.txt b/contrib/boringssl-cmake/CMakeLists.txt index 180fb3874c1..faee2dfddb3 100644 --- a/contrib/boringssl-cmake/CMakeLists.txt +++ b/contrib/boringssl-cmake/CMakeLists.txt @@ -1,7 +1,12 @@ # Needed for: # - securely connecting to an external server, e.g. clickhouse-client --host ... --secure # - lots of thirdparty libraries -option(ENABLE_SSL "Enable ssl" ${ENABLE_LIBRARIES}) + +# Actually, so many 3rd party libraries + unit tests need SSL that we cannot disable it +# without breaking the build ... +option(ENABLE_SSL "Enable ssl" ON) # breaks if OFF +# TODO: Making SSL dependent on ENABLE_LIBRARIES is desirable but needs fixing dependent libs + tests. 
+# option(ENABLE_SSL "Enable ssl" ${ENABLE_LIBRARIES}) if(NOT ENABLE_SSL) message(STATUS "Not using openssl") From 600512cc08622b672a0876d14d3b61a6f011d6a3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 31 May 2022 09:15:59 +0200 Subject: [PATCH 37/69] Replace exceptions thrown for programming errors by asserts --- src/Functions/MatchImpl.h | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 96ce0ca2eb0..78ce2627c35 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -18,8 +18,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int LOGICAL_ERROR; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; } namespace impl @@ -112,16 +110,14 @@ struct MatchImpl const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const String & needle, - const ColumnPtr & start_pos_, + [[maybe_unused]] const ColumnPtr & start_pos_, PaddedPODArray & res) { const size_t haystack_size = haystack_offsets.size(); - if (haystack_size != res.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks and results", name); + assert(haystack_size == res.size()); - if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); + assert(start_pos_ == nullptr); if (haystack_offsets.empty()) return; @@ -274,8 +270,7 @@ struct MatchImpl { const size_t haystack_size = haystack.size() / N; - if (haystack_size != res.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks and results", name); + assert(haystack_size == res.size()); if (haystack.empty()) return; @@ -433,16 +428,15 @@ struct MatchImpl const ColumnString::Offsets & haystack_offsets, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offset, - const ColumnPtr & start_pos_, + [[maybe_unused]] const ColumnPtr & start_pos_, PaddedPODArray & res) { const size_t haystack_size = haystack_offsets.size(); - if (haystack_size != needle_offset.size() || haystack_size != res.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks, needles and results", name); + assert(haystack_size == needle_offset.size()); + assert(haystack_size == res.size()); - if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); + assert(start_pos_ == nullptr); if (haystack_offsets.empty()) return; @@ -547,16 +541,15 @@ struct MatchImpl size_t N, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offset, - const ColumnPtr & start_pos_, + [[maybe_unused]] const ColumnPtr & start_pos_, PaddedPODArray & res) { const size_t haystack_size = haystack.size()/N; - if (haystack_size != needle_offset.size() || haystack_size != res.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks, needles and results", name); + assert(haystack_size == needle_offset.size()); + assert(haystack_size == res.size()); - if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); + assert(start_pos_ == nullptr); if (haystack.empty()) return; From 
81318e07d642def5090753c34082e859f2a42a65 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 31 May 2022 09:29:04 +0200 Subject: [PATCH 38/69] Try to fix performance test results --- ...h_pattern_caching.xml => re2_regex_caching.xml} | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) rename tests/performance/{like_and_match_pattern_caching.xml => re2_regex_caching.xml} (88%) diff --git a/tests/performance/like_and_match_pattern_caching.xml b/tests/performance/re2_regex_caching.xml similarity index 88% rename from tests/performance/like_and_match_pattern_caching.xml rename to tests/performance/re2_regex_caching.xml index c0a8ec9442e..6edc83097ba 100644 --- a/tests/performance/like_and_match_pattern_caching.xml +++ b/tests/performance/re2_regex_caching.xml @@ -5,15 +5,15 @@ numbers - numbers_mt(2000000) + numbers_mt(1500000) needle_like - simple patterns, all unique + '%' || toString(number) || '_' - simple patterns, low distinctness (10 patterns) + '%' || toString(number % 10) || '_' @@ -40,23 +40,27 @@ select toString(number) as haystack, like(haystack, '%x_') from(select * from {numbers}) + format Null select toString(number) as haystack, match(haystack, '.*x.') from(select * from {numbers}) + format Null select toString(number) as haystack, {needle_like} as needle, like(haystack, needle) - from (select * from {numbers}); + from (select * from {numbers}) + format Null select toString(number) as haystack, {needle_match} as needle, match(haystack, needle) - from (select * from {numbers}); + from (select * from {numbers}) + format Null From d84a21aea50d8683426e6c56e2c6522e750282e8 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 1 Jun 2022 12:57:19 +0200 Subject: [PATCH 39/69] Fixed tests --- tests/queries/0_stateless/02315_readonly_create_function.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02315_readonly_create_function.sh b/tests/queries/0_stateless/02315_readonly_create_function.sh index 03a4feb3038..07e97355883 100755 --- a/tests/queries/0_stateless/02315_readonly_create_function.sh +++ b/tests/queries/0_stateless/02315_readonly_create_function.sh @@ -1,4 +1,4 @@ -CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none +#!/usr/bin/env bash CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 9aee3f3156c888a2ad816347898d30684edf66b2 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 1 Jun 2022 13:06:44 +0200 Subject: [PATCH 40/69] Fixed tests --- .../functions/test_function_config.xml | 2 +- .../test.py | 33 ++++++++++++++++--- .../user_scripts/input_nullable.py | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml index dce7ab2eacd..5da2e854da8 100644 --- a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml +++ b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml @@ -301,7 +301,7 @@ - executable + executable_pool test_function_nullable_pool_python String diff --git a/tests/integration/test_executable_user_defined_function/test.py b/tests/integration/test_executable_user_defined_function/test.py index 20beded2284..f48547a1437 100644 --- a/tests/integration/test_executable_user_defined_function/test.py +++ b/tests/integration/test_executable_user_defined_function/test.py @@ -229,14 +229,37 @@ def 
test_executable_function_sum_json_python(started_cluster): node.query("DROP TABLE test_table;") + def test_executable_function_input_nullable_python(started_cluster): skip_test_msan(node) - node.query("CREATE TABLE test_table_nullable (value Nullable(UInt64)) ENGINE=TinyLog;") + node.query( + "CREATE TABLE test_table_nullable (value Nullable(UInt64)) ENGINE=TinyLog;" + ) node.query("INSERT INTO test_table_nullable VALUES (0), (NULL), (2);") - assert(node.query("SELECT test_function_nullable_python(1), test_function_nullable_python(NULL)") == "Key 1\tKey Nullable\n") - assert(node.query("SELECT test_function_nullable_python(value) FROM test_table_nullable;") == "Key 0\nKey Nullable\nKey 2\n") + assert ( + node.query( + "SELECT test_function_nullable_python(1), test_function_nullable_python(NULL)" + ) + == "Key 1\tKey Nullable\n" + ) + assert ( + node.query( + "SELECT test_function_nullable_python(value) FROM test_table_nullable;" + ) + == "Key 0\nKey Nullable\nKey 2\n" + ) - assert(node.query("SELECT test_function_nullable_pool_python(1), test_function_nullable_pool_python(NULL)") == "Key 1\tKey Nullable\n") - assert(node.query("SELECT test_function_nullable_pool_python(value) FROM test_table_nullable;") == "Key 0\nKey Nullable\nKey 2\n") + assert ( + node.query( + "SELECT test_function_nullable_pool_python(1), test_function_nullable_pool_python(NULL)" + ) + == "Key 1\tKey Nullable\n" + ) + assert ( + node.query( + "SELECT test_function_nullable_pool_python(value) FROM test_table_nullable;" + ) + == "Key 0\nKey Nullable\nKey 2\n" + ) diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py index f2f00a335bd..d1a825cf849 100755 --- a/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_nullable.py @@ -4,7 +4,7 @@ import sys if __name__ == "__main__": for line in sys.stdin: - if (line == "\\N\n"): + if line == "\\N\n": print("Key Nullable", end="\n") else: print("Key " + line, end="") From ded1398565ab3bafb049f1f40d6461204b86f15a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 1 Jun 2022 10:26:43 +0000 Subject: [PATCH 41/69] Fix intersect with const string --- src/Common/ColumnsHashing.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index c3a087c0a6e..e921f4fbf9a 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -6,9 +6,11 @@ #include #include #include +#include "Columns/IColumn.h" #include #include +#include #include #include @@ -83,8 +85,11 @@ struct HashMethodString HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { - const IColumn & column = *key_columns[0]; - const ColumnString & column_string = assert_cast(column); + const IColumn * column = key_columns[0]; + if (isColumnConst(*column)) + column = &assert_cast(*column).getDataColumn(); + + const ColumnString & column_string = assert_cast(*column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); } From 6c31d06b2ecb3e9a3a0afa8e257a27abc7b0629a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 1 Jun 2022 11:17:56 +0000 Subject: [PATCH 42/69] Add test for const string intersect --- tests/queries/0_stateless/02316_const_string_intersact.reference | 1 + 
tests/queries/0_stateless/02316_const_string_intersact.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/02316_const_string_intersact.reference create mode 100644 tests/queries/0_stateless/02316_const_string_intersact.sql diff --git a/tests/queries/0_stateless/02316_const_string_intersact.reference b/tests/queries/0_stateless/02316_const_string_intersact.reference new file mode 100644 index 00000000000..957124d5fdd --- /dev/null +++ b/tests/queries/0_stateless/02316_const_string_intersact.reference @@ -0,0 +1 @@ +Play ClickHouse diff --git a/tests/queries/0_stateless/02316_const_string_intersact.sql b/tests/queries/0_stateless/02316_const_string_intersact.sql new file mode 100644 index 00000000000..ace3c8d03c5 --- /dev/null +++ b/tests/queries/0_stateless/02316_const_string_intersact.sql @@ -0,0 +1 @@ +SELECT 'Play ClickHouse' InterSect SELECT 'Play ClickHouse' From f49dd19e7ad2d7c7fa94459bce1e3226ba684da7 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 1 Jun 2022 11:43:58 +0000 Subject: [PATCH 43/69] Revert "Initialize ParallelReadBuffer after construction" This reverts commit 31e1e678366956ce6585a611baeb3dc53301641a. --- src/IO/ParallelReadBuffer.cpp | 13 ------------- src/IO/ParallelReadBuffer.h | 5 ----- src/Storages/StorageS3.cpp | 5 +---- src/Storages/StorageURL.cpp | 10 +++++----- 4 files changed, 6 insertions(+), 27 deletions(-) diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 8d776a115a7..512f1c856b7 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -48,15 +48,8 @@ ParallelReadBuffer::ParallelReadBuffer( , max_working_readers(max_working_readers_) , schedule(std::move(schedule_)) , reader_factory(std::move(reader_factory_)) -{} - -void ParallelReadBuffer::initialize() { - if (initialized) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is initialized twice"); - addReaders(); - initialized = true; } bool ParallelReadBuffer::addReaderToPool() @@ -83,9 +76,6 @@ void ParallelReadBuffer::addReaders() off_t ParallelReadBuffer::seek(off_t offset, int whence) { - if (!initialized) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is not initialized"); - if (whence != SEEK_SET) throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); @@ -182,9 +172,6 @@ void ParallelReadBuffer::handleEmergencyStop() bool ParallelReadBuffer::nextImpl() { - if (!initialized) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ParallelReadBuffer is not initialized"); - if (all_completed) return false; diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 0568a5f0298..83b978848f8 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -40,9 +40,6 @@ public: explicit ParallelReadBuffer(std::unique_ptr reader_factory_, CallbackRunner schedule_, size_t max_working_readers); - // some readers can throw exception during constructor call so we can't initialize ParallelReadBuffer there - void initialize(); - ~ParallelReadBuffer() override { finishAndWait(); } off_t seek(off_t off, int whence) override; @@ -99,8 +96,6 @@ private: off_t current_position{0}; bool all_completed{false}; - - bool initialized{false}; }; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 39408e8ef36..393ea0e24ff 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -479,10 +479,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k LOG_TRACE( log, "Downloading from 
S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); - auto parallel_read_buffer - = std::make_unique(std::move(factory), threadPoolCallbackRunner(IOThreadPool::get()), download_thread_num); - parallel_read_buffer->initialize(); - return parallel_read_buffer; + return std::make_unique(std::move(factory), threadPoolCallbackRunner(IOThreadPool::get()), download_thread_num); } String StorageS3Source::getName() const diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index fdc6adb8c15..062241797e0 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -345,12 +345,12 @@ namespace /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error); - auto parallel_read_buffer = std::make_unique( - std::move(read_buffer_factory), threadPoolCallbackRunner(IOThreadPool::get()), download_threads); - parallel_read_buffer->initialize(); - return wrapReadBufferWithCompressionMethod( - std::move(parallel_read_buffer), chooseCompressionMethod(request_uri.getPath(), compression_method)); + std::make_unique( + std::move(read_buffer_factory), + threadPoolCallbackRunner(IOThreadPool::get()), + download_threads), + chooseCompressionMethod(request_uri.getPath(), compression_method)); } } catch (const Poco::Exception & e) From 08c20be4d087dd379c481030caefdb8d98862f10 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 1 Jun 2022 11:51:01 +0000 Subject: [PATCH 44/69] Cleaner exception handling in ParallelReadBuffer --- src/IO/ParallelReadBuffer.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 512f1c856b7..926d10bda5b 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -49,7 +49,15 @@ ParallelReadBuffer::ParallelReadBuffer( , schedule(std::move(schedule_)) , reader_factory(std::move(reader_factory_)) { - addReaders(); + try + { + addReaders(); + } + catch (const Exception &) + { + finishAndWait(); + throw; + } } bool ParallelReadBuffer::addReaderToPool() From 5a1b873f7bcbd247a5809f6794d0517efdc34dbc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 1 Jun 2022 13:54:53 +0200 Subject: [PATCH 45/69] No need to checkout submodules/contribs recursively Also verified locally by building from a freshly cloned ClickHouse and "flat" checkout of submodules without recursion --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abe263834ed..a6a09afc489 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON) # Check that submodules are present if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/sysroot/README.md") - message (FATAL_ERROR "Submodules are not initialized. Run\n\tgit submodule update --init --recursive") + message (FATAL_ERROR "Submodules are not initialized. 
Run\n\tgit submodule update --init") endif () # Take care to add prlimit in command line before ccache, or else ccache thinks that From b62e4cec65389f1938bd2280041855ea7b744544 Mon Sep 17 00:00:00 2001 From: flynn Date: Wed, 1 Jun 2022 12:39:16 +0000 Subject: [PATCH 46/69] Fix crash of FunctionHashID --- src/Functions/FunctionHashID.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Functions/FunctionHashID.h b/src/Functions/FunctionHashID.h index fbfb368bec7..30f08c96eca 100644 --- a/src/Functions/FunctionHashID.h +++ b/src/Functions/FunctionHashID.h @@ -51,9 +51,11 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (arguments.size() < 1) + if (arguments.empty()) throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least one argument", getName()); const auto & id_col = arguments[0]; @@ -114,18 +116,16 @@ public: const auto & numcolumn = arguments[0].column; if (checkAndGetColumn(numcolumn.get()) || checkAndGetColumn(numcolumn.get()) - || checkAndGetColumn(numcolumn.get()) || checkAndGetColumn(numcolumn.get()) - || checkAndGetColumnConst(numcolumn.get()) || checkAndGetColumnConst(numcolumn.get()) - || checkAndGetColumnConst(numcolumn.get()) || checkAndGetColumnConst(numcolumn.get())) + || checkAndGetColumn(numcolumn.get()) || checkAndGetColumn(numcolumn.get())) { std::string salt; - UInt8 minLength = 0; + UInt8 min_length = 0; std::string alphabet; if (arguments.size() >= 4) { const auto & alphabetcolumn = arguments[3].column; - if (auto alpha_col = checkAndGetColumnConst(alphabetcolumn.get())) + if (const auto * alpha_col = checkAndGetColumnConst(alphabetcolumn.get())) { alphabet = alpha_col->getValue(); if (alphabet.find('\0') != std::string::npos) @@ -138,18 +138,18 @@ public: if (arguments.size() >= 3) { const auto & minlengthcolumn = arguments[2].column; - if (auto min_length_col = checkAndGetColumnConst(minlengthcolumn.get())) - minLength = min_length_col->getValue(); + if (const auto * min_length_col = checkAndGetColumnConst(minlengthcolumn.get())) + min_length = min_length_col->getValue(); } if (arguments.size() >= 2) { const auto & saltcolumn = arguments[1].column; - if (auto salt_col = checkAndGetColumnConst(saltcolumn.get())) + if (const auto * salt_col = checkAndGetColumnConst(saltcolumn.get())) salt = salt_col->getValue(); } - hashidsxx::Hashids hash(salt, minLength, alphabet); + hashidsxx::Hashids hash(salt, min_length, alphabet); auto col_res = ColumnString::create(); From ac10a6dc28e1211e213deed7e77d1688f8194a32 Mon Sep 17 00:00:00 2001 From: flynn Date: Wed, 1 Jun 2022 12:41:36 +0000 Subject: [PATCH 47/69] update test --- tests/queries/0_stateless/02293_hashid.reference | 3 ++- tests/queries/0_stateless/02293_hashid.sql | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02293_hashid.reference b/tests/queries/0_stateless/02293_hashid.reference index 9ae4cce3944..f36b1500288 100644 --- a/tests/queries/0_stateless/02293_hashid.reference +++ b/tests/queries/0_stateless/02293_hashid.reference @@ -8,4 +8,5 @@ 2 obmgndljgajpkeao 3 dldokmpjpgjgeanb 4 nkdlpgajngjnobme -YQrvD5XGvbx +xkOpDGxQpVB +jR diff --git a/tests/queries/0_stateless/02293_hashid.sql b/tests/queries/0_stateless/02293_hashid.sql index 
145bd76ccbf..45aaefe7356 100644 --- a/tests/queries/0_stateless/02293_hashid.sql +++ b/tests/queries/0_stateless/02293_hashid.sql @@ -3,3 +3,5 @@ SET allow_experimental_hash_functions = 1; select number, hashid(number) from system.numbers limit 5; select number, hashid(number, 's3cr3t', 16, 'abcdefghijklmnop') from system.numbers limit 5; select hashid(1234567890123456, 's3cr3t'); + +SELECT hashid(1, hashid(2)); From 503d94f73be98b63318d82cfc9de0bb276cabb6d Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 1 Jun 2022 14:44:46 +0200 Subject: [PATCH 48/69] Update test.py --- tests/integration/test_mysql_database_engine/test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_mysql_database_engine/test.py b/tests/integration/test_mysql_database_engine/test.py index 500d9176f4b..8626980a768 100644 --- a/tests/integration/test_mysql_database_engine/test.py +++ b/tests/integration/test_mysql_database_engine/test.py @@ -931,8 +931,10 @@ def test_predefined_connection_configuration(started_cluster): ) result = clickhouse_node.query("show create table test_database.test_table") - print(result) - assert(result.strip() == "CREATE TABLE test_database.test_table\\n(\\n `id` Int32\\n)\\nENGINE = MySQL(mysql1, table = \\'test_table\\')") + assert ( + result.strip() + == "CREATE TABLE test_database.test_table\\n(\\n `id` Int32\\n)\\nENGINE = MySQL(mysql1, table = \\'test_table\\')" + ) clickhouse_node.query("DROP DATABASE test_database") clickhouse_node.query_and_get_error( From 4413f0c7c12cd7a80c75c28e28ed2dac964e6686 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 1 Jun 2022 14:52:01 +0200 Subject: [PATCH 49/69] Fixed tests --- .../0_stateless/02315_readonly_create_function.reference | 2 +- tests/queries/0_stateless/02315_readonly_create_function.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02315_readonly_create_function.reference b/tests/queries/0_stateless/02315_readonly_create_function.reference index d00491fd7e5..573541ac970 100644 --- a/tests/queries/0_stateless/02315_readonly_create_function.reference +++ b/tests/queries/0_stateless/02315_readonly_create_function.reference @@ -1 +1 @@ -1 +0 diff --git a/tests/queries/0_stateless/02315_readonly_create_function.sh b/tests/queries/0_stateless/02315_readonly_create_function.sh index 07e97355883..70e27e9ede9 100755 --- a/tests/queries/0_stateless/02315_readonly_create_function.sh +++ b/tests/queries/0_stateless/02315_readonly_create_function.sh @@ -4,4 +4,5 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT --readonly 1 --query "CREATE FUNCTION test_function AS (x) -> x + 1;" 2>&1 | grep -c -F 'Code: 164' +$CLICKHOUSE_CLIENT --readonly 1 --query "CREATE FUNCTION test_function AS (x) -> x + 1;" 2>&1 | grep -q "Code: 164" +echo $?; From 6a5f5997cae8479d19bc21e4254c3e66f8dcf58c Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 1 Jun 2022 12:55:47 +0000 Subject: [PATCH 50/69] Add test 02315_pmj_union_ubsan_35857 --- .../02315_pmj_union_ubsan_35857.reference | 2 ++ .../02315_pmj_union_ubsan_35857.sql | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/queries/0_stateless/02315_pmj_union_ubsan_35857.reference create mode 100644 tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql diff --git a/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.reference b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.reference new file mode 100644 index 00000000000..96e34d5a44c --- /dev/null +++ b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.reference @@ -0,0 +1,2 @@ +\N +\N diff --git a/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql new file mode 100644 index 00000000000..38f1d2e1b4e --- /dev/null +++ b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql @@ -0,0 +1,22 @@ +SET join_algorithm = 'partial_merge'; + +SELECT NULL +FROM +( + SELECT + NULL, + 1 AS a, + 0 :: Nullable(UInt8) AS c + UNION ALL + SELECT + NULL, + 65536, + NULL +) AS js1 +ALL LEFT JOIN +( + SELECT 2 :: Nullable(UInt8) AS a +) AS js2 +USING (a) +ORDER BY c +; From 3a824ef9a45d7231b9243e21cb6be8f5712edb0d Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Wed, 1 Jun 2022 16:00:30 +0200 Subject: [PATCH 51/69] Add no-backward-compatibility-check to 02315_pmj_union_ubsan_35857 --- tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql index 38f1d2e1b4e..47b47101a79 100644 --- a/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql +++ b/tests/queries/0_stateless/02315_pmj_union_ubsan_35857.sql @@ -1,3 +1,5 @@ +-- Tags: no-backward-compatibility-check + SET join_algorithm = 'partial_merge'; SELECT NULL From 79576fa08cf0ce81f200acc901455c54f2622df7 Mon Sep 17 00:00:00 2001 From: bkuschel Date: Wed, 1 Jun 2022 07:41:27 -0700 Subject: [PATCH 52/69] Use new submodule sync --- .github/workflows/nightly.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index b2ddd87d173..3ebf58b858a 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -81,7 +81,6 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' BUILD_NAME=coverity CACHES_PATH=${{runner.temp}}/../ccaches - CHECK_NAME=ClickHouse build check (actions) IMAGES_PATH=${{runner.temp}}/images_path REPO_COPY=${{runner.temp}}/build_check/ClickHouse TEMP_PATH=${{runner.temp}}/build_check @@ -99,13 +98,15 @@ jobs: id: coverity-checkout uses: actions/checkout@v2 with: - submodules: 'true' + fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$CHECK_NAME" 
"$BUILD_NAME" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - name: Upload Coverity Analysis if: ${{ success() || failure() }} run: | From 2626a496167d42fa5953bbce51b8386ef11961ee Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 May 2022 20:16:42 +0300 Subject: [PATCH 53/69] FR: Expose what triggered the merge in system.part_log #26255 --- src/Interpreters/PartLog.cpp | 31 ++++++++++++++++ src/Interpreters/PartLog.h | 15 ++++++++ src/Storages/MergeTree/MergeTreeData.cpp | 4 ++ .../02293_part_log_has_merge_reason.reference | 1 + .../02293_part_log_has_merge_reason.sql | 37 +++++++++++++++++++ 5 files changed, 88 insertions(+) create mode 100644 tests/queries/0_stateless/02293_part_log_has_merge_reason.reference create mode 100644 tests/queries/0_stateless/02293_part_log_has_merge_reason.sql diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 6d57f6b7045..274fc7384ab 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -16,6 +16,25 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) { + switch (merge_type) + { + case MergeType::REGULAR: + return REGULAR_MERGE; + case MergeType::TTL_DELETE: + return TTL_DELETE_MERGE; + case MergeType::TTL_RECOMPRESS: + return TTL_RECOMPRESS_MERGE; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown MergeType {}", static_cast(merge_type)); +} + NamesAndTypesList PartLogElement::getNamesAndTypes() { auto event_type_datatype = std::make_shared( @@ -30,11 +49,22 @@ NamesAndTypesList PartLogElement::getNamesAndTypes() } ); + auto merge_reason_datatype = std::make_shared( + DataTypeEnum8::Values + { + {"NotAMerge", static_cast(NOT_A_MERGE)}, + {"RegularMerge", static_cast(REGULAR_MERGE)}, + {"TTLDeleteMerge", static_cast(TTL_DELETE_MERGE)}, + {"TTLRecompressMerge", static_cast(TTL_RECOMPRESS_MERGE)}, + } + ); + ColumnsWithTypeAndName columns_with_type_and_name; return { {"query_id", std::make_shared()}, {"event_type", std::move(event_type_datatype)}, + {"merge_reason", std::move(merge_reason_datatype)}, {"event_date", std::make_shared()}, {"event_time", std::make_shared()}, @@ -72,6 +102,7 @@ void PartLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(query_id); columns[i++]->insert(event_type); + columns[i++]->insert(merge_reason); columns[i++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType()); columns[i++]->insert(event_time); columns[i++]->insert(event_time_microseconds); diff --git a/src/Interpreters/PartLog.h b/src/Interpreters/PartLog.h index 470dce09fa0..16a7e37ee9d 100644 --- a/src/Interpreters/PartLog.h +++ b/src/Interpreters/PartLog.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -21,9 +22,22 @@ struct PartLogElement MOVE_PART = 6, }; + enum MergeReasonType + { + /// merge_reason is relevant only for event_type = 'MERGE_PARTS', in other cases it is NOT_A_MERGE + NOT_A_MERGE = 1, + /// Just regular merge + REGULAR_MERGE = 2, + /// Merge assigned to delete some data from parts (with TTLMergeSelector) + TTL_DELETE_MERGE = 3, + /// Merge with recompression + TTL_RECOMPRESS_MERGE = 4, + }; + String query_id; Type event_type = NEW_PART; + MergeReasonType merge_reason = NOT_A_MERGE; time_t event_time = 0; Decimal64 event_time_microseconds = 0; @@ -57,6 +71,7 @@ struct PartLogElement static std::string name() { return "PartLog"; } + static MergeReasonType getMergeReasonType(MergeType 
merge_type); static NamesAndTypesList getNamesAndTypes(); static NamesAndAliases getNamesAndAliases() { return {}; } void appendToBlock(MutableColumns & columns) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 4eb4049be60..c4c99e66873 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6176,6 +6176,10 @@ try part_log_elem.event_type = type; + if (part_log_elem.event_type == PartLogElement::MERGE_PARTS) + if (merge_entry) + part_log_elem.merge_reason = PartLogElement::getMergeReasonType((*merge_entry)->merge_type); + part_log_elem.error = static_cast(execution_status.code); part_log_elem.exception = execution_status.message; diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql new file mode 100644 index 00000000000..db1f4c26af4 --- /dev/null +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -0,0 +1,37 @@ +DROP TABLE IF EXISTS t_part_log_has_merge_type_table; + +CREATE TABLE t_part_log_has_merge_type_table +( + event_time DateTime, + UserID UInt64, + Comment String +) +ENGINE = MergeTree() +ORDER BY tuple() +TTL event_time + toIntervalMonth(3) +SETTINGS min_bytes_for_wide_part = 0, merge_with_ttl_timeout = 1; + +INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); +INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); + +OPTIMIZE TABLE t_part_log_has_merge_type_table FINAL; + +SYSTEM FLUSH LOGS; + +SELECT count(*) +FROM +( + SELECT + metadata_modification_time, + event_time + FROM system.tables AS l + INNER JOIN system.part_log AS r + ON l.name = r.table + WHERE (l.database = currentDatabase()) AND + (l.name = 't_part_log_has_merge_type_table') AND + (r.event_type = 'MergeParts') AND + (r.merge_reason = 'TTLDeleteMerge') +) +WHERE (metadata_modification_time <= event_time); + +DROP TABLE t_part_log_has_merge_type_table; From 16dc3ed97d29a17d34d9b8090ecbb070ebab3424 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 4 May 2022 20:16:42 +0300 Subject: [PATCH 54/69] FR: Expose what triggered the merge in system.part_log #26255 --- docs/en/operations/system-tables/part_log.md | 6 +++++ src/Interpreters/PartLog.cpp | 9 ++++--- .../02293_part_log_has_merge_reason.reference | 2 +- .../02293_part_log_has_merge_reason.sql | 26 +++++++------------ 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md index 00eaca23862..1b567367c97 100644 --- a/docs/en/operations/system-tables/part_log.md +++ b/docs/en/operations/system-tables/part_log.md @@ -14,6 +14,11 @@ The `system.part_log` table contains the following columns: - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../../sql-reference/statements/alter/partition.md#alter_detach-partition). - `MUTATE_PART` — Mutating of a data part. - `MOVE_PART` — Moving the data part from the one disk to another one. +- `merge_reason` ([Enum8](../../sql-reference/data-types/enum.md)) — The reason for the event with type `MERGE_PARTS`. 
Can have one of the following values: + - `NOT_A_MERGE` — The current event has the type other than `MERGE_PARTS`. + - `REGULAR_MERGE` — Some regular merge. + - `TTL_DELETE_MERGE` — Cleaning up expired data. + - `TTL_RECOMPRESS_MERGE` — Recompressing data part with the. - `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. - `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision. @@ -46,6 +51,7 @@ Row 1: ────── query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31 event_type: NewPart +merge_reason: NotAMerge event_date: 2021-02-02 event_time: 2021-02-02 11:14:28 event_time_microseconds: 2021-02-02 11:14:28.861919 diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 274fc7384ab..13b74f3d00a 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -21,14 +21,15 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) { +PartLogElement::MergeReasonType PartLogElement::getMergeReasonType(MergeType merge_type) +{ switch (merge_type) { - case MergeType::REGULAR: + case MergeType::Regular: return REGULAR_MERGE; - case MergeType::TTL_DELETE: + case MergeType::TTLDelete: return TTL_DELETE_MERGE; - case MergeType::TTL_RECOMPRESS: + case MergeType::TTLRecompress: return TTL_RECOMPRESS_MERGE; } diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference index d00491fd7e5..220107cf15b 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.reference @@ -1 +1 @@ -1 +MergeParts TTLDeleteMerge diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql index db1f4c26af4..7ef86354e71 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -8,30 +8,22 @@ CREATE TABLE t_part_log_has_merge_type_table ) ENGINE = MergeTree() ORDER BY tuple() -TTL event_time + toIntervalMonth(3) -SETTINGS min_bytes_for_wide_part = 0, merge_with_ttl_timeout = 1; +SETTINGS min_bytes_for_wide_part = 0, materialize_ttl_recalculate_only = true; INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); +ALTER TABLE t_part_log_has_merge_type_table + MODIFY TTL event_time + INTERVAL 3 MONTH; + OPTIMIZE TABLE t_part_log_has_merge_type_table FINAL; SYSTEM FLUSH LOGS; -SELECT count(*) -FROM -( - SELECT - metadata_modification_time, - event_time - FROM system.tables AS l - INNER JOIN system.part_log AS r - ON l.name = r.table - WHERE (l.database = currentDatabase()) AND - (l.name = 't_part_log_has_merge_type_table') AND - (r.event_type = 'MergeParts') AND - (r.merge_reason = 'TTLDeleteMerge') -) -WHERE (metadata_modification_time <= event_time); +SELECT + event_type, + merge_reason +FROM system.part_log +WHERE (table = 't_part_log_has_merge_type_table') AND (merge_reason = 'TTLDeleteMerge'); DROP TABLE t_part_log_has_merge_type_table; From b3b3d7a45950d04d630e9930b7da2c331f604e4a Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 1 Jun 2022 16:57:28 +0200 Subject: [PATCH 
55/69] Fix test --- .../02293_part_log_has_merge_reason.sql | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql index 7ef86354e71..002bc1f37dd 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -8,14 +8,12 @@ CREATE TABLE t_part_log_has_merge_type_table ) ENGINE = MergeTree() ORDER BY tuple() +TTL event_time + INTERVAL 3 MONTH SETTINGS min_bytes_for_wide_part = 0, materialize_ttl_recalculate_only = true; INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); -ALTER TABLE t_part_log_has_merge_type_table - MODIFY TTL event_time + INTERVAL 3 MONTH; - OPTIMIZE TABLE t_part_log_has_merge_type_table FINAL; SYSTEM FLUSH LOGS; @@ -23,7 +21,13 @@ SYSTEM FLUSH LOGS; SELECT event_type, merge_reason -FROM system.part_log -WHERE (table = 't_part_log_has_merge_type_table') AND (merge_reason = 'TTLDeleteMerge'); +FROM + system.part_log +WHERE + table = 't_part_log_has_merge_type_table' + AND + merge_reason = 'TTLDeleteMerge' + AND + database = currentDatabase(); DROP TABLE t_part_log_has_merge_type_table; From 7ef02a2e440b8b6db53745a327d4414e7a68f9ff Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 1 Jun 2022 15:32:33 +0000 Subject: [PATCH 56/69] Fix possible logical error in values table function --- src/Interpreters/evaluateConstantExpression.cpp | 14 ++++++++++++-- .../02316_values_table_func_bug.reference | 2 ++ .../0_stateless/02316_values_table_func_bug.sql | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02316_values_table_func_bug.reference create mode 100644 tests/queries/0_stateless/02316_values_table_func_bug.sql diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index f5ad0337629..e9110d00128 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -31,11 +31,21 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static std::pair> getFieldAndDataTypeFromLiteral(ASTLiteral * literal) +{ + auto type = applyVisitor(FieldToDataType(), literal->value); + /// In case of Array field nested fields can have different types. + /// Example: Array [1, 2.3] will have 2 fields with types UInt64 and Float64 + /// when result type is Array(Float64). + /// So, we need to convert this field to the result type. + Field res = convertFieldToType(literal->value, *type); + return {res, type}; +} std::pair> evaluateConstantExpression(const ASTPtr & node, ContextPtr context) { if (ASTLiteral * literal = node->as()) - return std::make_pair(literal->value, applyVisitor(FieldToDataType(), literal->value)); + return getFieldAndDataTypeFromLiteral(literal); NamesAndTypesList source_columns = {{ "_dummy", std::make_shared() }}; @@ -63,7 +73,7 @@ std::pair> evaluateConstantExpression(co /// AST potentially could be transformed to literal during TreeRewriter analyze. /// For example if we have SQL user defined function that return literal AS subquery. 
if (ASTLiteral * literal = ast->as()) - return std::make_pair(literal->value, applyVisitor(FieldToDataType(), literal->value)); + return getFieldAndDataTypeFromLiteral(literal); ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions(); diff --git a/tests/queries/0_stateless/02316_values_table_func_bug.reference b/tests/queries/0_stateless/02316_values_table_func_bug.reference new file mode 100644 index 00000000000..63f5d8d96c8 --- /dev/null +++ b/tests/queries/0_stateless/02316_values_table_func_bug.reference @@ -0,0 +1,2 @@ +[1,2.2] +[[1,2,3],[1.1,2.2,3.3]] diff --git a/tests/queries/0_stateless/02316_values_table_func_bug.sql b/tests/queries/0_stateless/02316_values_table_func_bug.sql new file mode 100644 index 00000000000..7c66cf125e1 --- /dev/null +++ b/tests/queries/0_stateless/02316_values_table_func_bug.sql @@ -0,0 +1,2 @@ +select * from values([1, 2.2]); +select * from values([[1, 2, 3], [1.1, 2.2, 3.3]]); From 1ef48c3a4afe1c57ad14e678efc5ab3958dc010e Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 1 Jun 2022 15:42:12 +0000 Subject: [PATCH 57/69] turn on setting output_format_json_named_tuples_as_objects by default --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c241955b268..320386c9bfe 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -695,7 +695,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, output_format_json_quote_denormals, false, "Enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format.", 0) \ \ M(Bool, output_format_json_escape_forward_slashes, true, "Controls escaping forward slashes for string outputs in JSON output format. This is intended for compatibility with JavaScript. 
Don't confuse with backslashes that are always escaped.", 0) \ - M(Bool, output_format_json_named_tuples_as_objects, false, "Serialize named tuple columns as JSON objects.", 0) \ + M(Bool, output_format_json_named_tuples_as_objects, true, "Serialize named tuple columns as JSON objects.", 0) \ M(Bool, output_format_json_array_of_rows, false, "Output a JSON array of all rows in JSONEachRow(Compact) format.", 0) \ \ M(UInt64, output_format_pretty_max_rows, 10000, "Rows limit for Pretty formats.", 0) \ From 895a96de95e5b6346cce16f18e7915bd9234ad1b Mon Sep 17 00:00:00 2001 From: bkuschel Date: Wed, 1 Jun 2022 09:44:37 -0700 Subject: [PATCH 58/69] Remove recursive --- .github/workflows/nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3ebf58b858a..e712ada1551 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -101,8 +101,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync --recursive - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" From 4abfd54dd62f062ab9593a2f297a30a7d8b174cf Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 1 Jun 2022 16:53:37 +0000 Subject: [PATCH 59/69] Fix possible segfault in schema inference --- src/Formats/ReadSchemaUtils.cpp | 21 +++++++++++-------- ...18_template_schema_inference_bug.reference | 0 .../02318_template_schema_inference_bug.sql | 2 ++ 3 files changed, 14 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02318_template_schema_inference_bug.reference create mode 100755 tests/queries/0_stateless/02318_template_schema_inference_bug.sql diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 035546031d8..11a91bd50dc 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -100,18 +100,21 @@ ColumnsDescription readSchemaFromFormat( catch (...) 
{ auto exception_message = getCurrentExceptionMessage(false); - size_t rows_read = schema_reader->getNumRowsRead(); - assert(rows_read <= max_rows_to_read); - max_rows_to_read -= schema_reader->getNumRowsRead(); - if (rows_read != 0 && max_rows_to_read == 0) + if (schema_reader) { - exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference"; - if (iterations > 1) + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + if (rows_read != 0 && max_rows_to_read == 0) { - exception_messages += "\n" + exception_message; - break; + exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference"; + if (iterations > 1) + { + exception_messages += "\n" + exception_message; + break; + } + retry = false; } - retry = false; } if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) diff --git a/tests/queries/0_stateless/02318_template_schema_inference_bug.reference b/tests/queries/0_stateless/02318_template_schema_inference_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02318_template_schema_inference_bug.sql b/tests/queries/0_stateless/02318_template_schema_inference_bug.sql new file mode 100755 index 00000000000..42646013dd5 --- /dev/null +++ b/tests/queries/0_stateless/02318_template_schema_inference_bug.sql @@ -0,0 +1,2 @@ +insert into function file(data_02318.tsv) select * from numbers(10); +desc file('data_02318.tsv', 'Template') SETTINGS format_template_row='nonexist', format_template_resultset='nonexist'; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} From 06f80770b8492c2e4cc5c5bd3e302c8721ebac46 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 1 Jun 2022 20:11:53 +0200 Subject: [PATCH 60/69] fix stuck REPALCE_RANGE --- .../MergeTree/ReplicatedMergeTreeQueue.cpp | 23 +++++++++++++++++++ .../MergeTree/ReplicatedMergeTreeQueue.h | 4 ++++ src/Storages/StorageReplicatedMergeTree.cpp | 7 ++++++ 3 files changed, 34 insertions(+) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index fbdb1dabd88..9f679f121b8 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -2183,6 +2183,29 @@ bool ReplicatedMergeTreeMergePredicate::canMergeSinglePart( } +bool ReplicatedMergeTreeMergePredicate::partParticipatesInReplaceRange(const MergeTreeData::DataPartPtr & part, String * out_reason) const +{ + std::lock_guard lock(queue.state_mutex); + for (const auto & entry : queue.queue) + { + if (entry->type != ReplicatedMergeTreeLogEntry::REPLACE_RANGE) + continue; + + for (const auto & part_name : entry->replace_range_entry->new_part_names) + { + if (part->info.isDisjoint(MergeTreePartInfo::fromPartName(part_name, queue.format_version))) + continue; + + if (out_reason) + *out_reason = fmt::format("Part {} participates in REPLACE_RANGE {} ({})", part_name, entry->new_part_name, entry->znode_name); + + return true; + } + } + return false; +} + + std::optional> ReplicatedMergeTreeMergePredicate::getDesiredMutationVersion(const MergeTreeData::DataPartPtr & part) const { /// Assigning mutations is easier than assigning merges because mutations appear in the same order as diff --git 
a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 0c0e872b0ac..dea4d0573db 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -501,6 +501,10 @@ public: /// This predicate is checked for the first part of each range. bool canMergeSinglePart(const MergeTreeData::DataPartPtr & part, String * out_reason) const; + /// Returns true if part is needed for some REPLACE_RANGE entry. + /// We should not drop part in this case, because replication queue may stuck without that part. + bool partParticipatesInReplaceRange(const MergeTreeData::DataPartPtr & part, String * out_reason) const; + /// Return nonempty optional of desired mutation version and alter version. /// If we have no alter (modify/drop) mutations in mutations queue, than we return biggest possible /// mutation version (and -1 as alter version). In other case, we return biggest mutation version with diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1d4b22d4a59..07eb8b18765 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7128,6 +7128,13 @@ bool StorageReplicatedMergeTree::dropPartImpl( return false; } + if (merge_pred.partParticipatesInReplaceRange(part, &out_reason)) + { + if (throw_if_noop) + throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, out_reason); + return false; + } + if (partIsLastQuorumPart(part->info)) { if (throw_if_noop) From 663261673381d392a6a00753c165de84f5629827 Mon Sep 17 00:00:00 2001 From: lthaooo Date: Thu, 2 Jun 2022 03:09:53 +0800 Subject: [PATCH 61/69] Fix TTL merge scheduling bug (#36387) --- src/Storages/MergeTree/BackgroundJobsAssignee.cpp | 3 ++- src/Storages/MergeTree/BackgroundJobsAssignee.h | 2 +- src/Storages/MergeTree/MergeList.h | 5 +++++ src/Storages/StorageMergeTree.cpp | 8 ++++++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp index 4dc15d6e794..81445f40ed6 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -50,10 +50,11 @@ void BackgroundJobsAssignee::postpone() } -void BackgroundJobsAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) +bool BackgroundJobsAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) { bool res = getContext()->getMergeMutateExecutor()->trySchedule(merge_task); res ? 
trigger() : postpone(); + return res; } diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.h b/src/Storages/MergeTree/BackgroundJobsAssignee.h index e6c5845c657..db93b5f710b 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.h +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.h @@ -66,7 +66,7 @@ public: void postpone(); void finish(); - void scheduleMergeMutateTask(ExecutableTaskPtr merge_task); + bool scheduleMergeMutateTask(ExecutableTaskPtr merge_task); void scheduleFetchTask(ExecutableTaskPtr fetch_task); void scheduleMoveTask(ExecutableTaskPtr move_task); void scheduleCommonTask(ExecutableTaskPtr common_task, bool need_trigger); diff --git a/src/Storages/MergeTree/MergeList.h b/src/Storages/MergeTree/MergeList.h index a944779ad44..ac1db503d9b 100644 --- a/src/Storages/MergeTree/MergeList.h +++ b/src/Storages/MergeTree/MergeList.h @@ -197,6 +197,11 @@ public: ++merges_with_ttl_counter; } + void cancelMergeWithTTL() + { + --merges_with_ttl_counter; + } + size_t getMergesWithTTLCount() const { return merges_with_ttl_counter; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 2cb62801fd5..81b61909228 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1168,8 +1168,12 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign { auto task = std::make_shared(*this, metadata_snapshot, false, Names{}, merge_entry, share_lock, common_assignee_trigger); task->setCurrentTransaction(std::move(transaction_for_merge), std::move(txn)); - assignee.scheduleMergeMutateTask(task); - return true; + bool scheduled = assignee.scheduleMergeMutateTask(task); + /// The problem that we already booked a slot for TTL merge, but a merge list entry will be created only in a prepare method + /// in MergePlainMergeTreeTask. So, this slot will never be freed. + if (!scheduled && isTTLMergeType(merge_entry->future_part->merge_type)) + getContext()->getMergeList().cancelMergeWithTTL(); + return scheduled; } if (mutate_entry) { From ec6e413f0bae58b2dd0cebb5babd1182a0f9d60e Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:00:49 +0200 Subject: [PATCH 62/69] Fixed runtime check for AVX512F --- src/Common/CpuId.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/CpuId.h b/src/Common/CpuId.h index 5037c687943..167fa22faf6 100644 --- a/src/Common/CpuId.h +++ b/src/Common/CpuId.h @@ -221,7 +221,7 @@ bool haveAVX512F() noexcept && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS && ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS && CpuInfo(0x0).registers.eax >= 0x7 // leaf 7 is present - && ((CpuInfo(0x7).registers.ebx >> 16) & 1u); // AVX512F bit + && ((CpuInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit #else return false; #endif From 5fcf8401562bc67cf0a4ed6fd197c5283fe07c34 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 2 Jun 2022 08:43:44 +0300 Subject: [PATCH 63/69] Typo. 
--- src/IO/S3Common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 98471f5b81f..327730d9740 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -45,7 +45,7 @@ public: const String & force_region, const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, - bool enable_s3_requestrs_logging); + bool enable_s3_requests_logging); private: ClientFactory(); From a857bc2ccff315aeb16f1a4106789164cab8034e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 2 Jun 2022 08:46:41 +0300 Subject: [PATCH 64/69] Update S3Common.cpp --- src/IO/S3Common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index c283afb21e4..fc4ab707026 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -765,9 +765,9 @@ namespace S3 const String & force_region, const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, - bool enable_s3_requestrs_logging) + bool enable_s3_requests_logging) { - return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requestrs_logging); + return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging); } URI::URI(const Poco::URI & uri_) From eef6a5ec9684b057708bdb23e690e7da5fffd8fc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 2 Jun 2022 09:41:12 +0300 Subject: [PATCH 65/69] Revert "Remove resursive submodules" --- .github/workflows/master.yml | 64 +++++++++++++++--------------- .github/workflows/pull_request.yml | 64 +++++++++++++++--------------- .gitmodules | 6 +-- contrib/arrow | 2 +- contrib/brotli | 2 +- contrib/cppkafka | 2 +- contrib/msgpack-c | 2 +- contrib/rapidjson | 2 +- contrib/snappy | 2 +- utils/check-style/check-style | 3 -- 10 files changed, 73 insertions(+), 76 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index e0954aab236..c890488ea80 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -215,8 +215,8 @@ jobs: fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -259,8 +259,8 @@ jobs: fetch-depth: 0 # For a proper version and performance artifacts - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -305,8 +305,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -350,8 +350,8 @@ jobs: # uses: actions/checkout@v2 # - name: Build # run: | - # git -C 
"$GITHUB_WORKSPACE" submodule sync - # git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + # git -C "$GITHUB_WORKSPACE" submodule sync --recursive + # git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 # sudo rm -fr "$TEMP_PATH" # mkdir -p "$TEMP_PATH" # cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -395,8 +395,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -440,8 +440,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -485,8 +485,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -530,8 +530,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -575,8 +575,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -623,8 +623,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -668,8 +668,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -715,8 +715,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + 
git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -762,8 +762,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -809,8 +809,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -856,8 +856,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -903,8 +903,8 @@ jobs: fetch-depth: 0 # otherwise we will have no info about contributors - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index f6e9880d088..76a26d685c5 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -277,8 +277,8 @@ jobs: fetch-depth: 0 # for performance artifact - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -322,8 +322,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -367,8 +367,8 @@ jobs: # uses: actions/checkout@v2 # - name: Build # run: | - # git -C "$GITHUB_WORKSPACE" submodule sync - # git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + # git -C "$GITHUB_WORKSPACE" submodule sync --recursive + # git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 # sudo rm -fr "$TEMP_PATH" # mkdir -p "$TEMP_PATH" # cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -414,8 +414,8 @@ jobs: fetch-depth: 0 # for performance 
artifact - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -459,8 +459,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -504,8 +504,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -549,8 +549,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -594,8 +594,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -639,8 +639,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -687,8 +687,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -732,8 +732,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -777,8 +777,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C 
"$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -822,8 +822,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -867,8 +867,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -912,8 +912,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" @@ -957,8 +957,8 @@ jobs: uses: actions/checkout@v2 - name: Build run: | - git -C "$GITHUB_WORKSPACE" submodule sync - git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10 + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" diff --git a/.gitmodules b/.gitmodules index aa68aa218b5..55fd684fddb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -79,10 +79,10 @@ url = https://github.com/ClickHouse/snappy.git [submodule "contrib/cppkafka"] path = contrib/cppkafka - url = https://github.com/ClickHouse/cppkafka.git + url = https://github.com/mfontanini/cppkafka.git [submodule "contrib/brotli"] path = contrib/brotli - url = https://github.com/ClickHouse/brotli.git + url = https://github.com/google/brotli.git [submodule "contrib/h3"] path = contrib/h3 url = https://github.com/ClickHouse/h3 @@ -144,7 +144,7 @@ ignore = untracked [submodule "contrib/msgpack-c"] path = contrib/msgpack-c - url = https://github.com/ClickHouse/msgpack-c + url = https://github.com/msgpack/msgpack-c [submodule "contrib/libcpuid"] path = contrib/libcpuid url = https://github.com/ClickHouse/libcpuid.git diff --git a/contrib/arrow b/contrib/arrow index 6f274b737c6..efdcd015cfd 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 6f274b737c66a6c39bab0d3bdf6cf7d139ef06f5 +Subproject commit efdcd015cfdee1b6aa349c9ca227ca12c3d697f5 diff --git a/contrib/brotli b/contrib/brotli index 5bd78768449..63be8a99401 160000 --- a/contrib/brotli +++ b/contrib/brotli @@ -1 +1 @@ -Subproject commit 5bd78768449751a78d4b4c646b0612917986f5b1 +Subproject commit 63be8a99401992075c23e99f7c84de1c653e39e2 diff --git a/contrib/cppkafka b/contrib/cppkafka index 64bd67db12b..5a119f689f8 160000 --- a/contrib/cppkafka +++ b/contrib/cppkafka @@ -1 +1 @@ -Subproject commit 64bd67db12b9c705e9127439a5b05b351d9df7da +Subproject commit 5a119f689f8a4d90d10a9635e7ee2bee5c127de1 diff --git a/contrib/msgpack-c b/contrib/msgpack-c index 790b3fe58eb..46684265d50 160000 --- 
a/contrib/msgpack-c +++ b/contrib/msgpack-c @@ -1 +1 @@ -Subproject commit 790b3fe58ebded7a8bd130782ef28bec5784c248 +Subproject commit 46684265d50b5d1b062d4c5c428ba08462844b1d diff --git a/contrib/rapidjson b/contrib/rapidjson index b571bd5c1a3..c4ef90ccdbc 160000 --- a/contrib/rapidjson +++ b/contrib/rapidjson @@ -1 +1 @@ -Subproject commit b571bd5c1a3b1fc931d77ae36932537a3c9018c3 +Subproject commit c4ef90ccdbc21d5d5a628d08316bfd301e32d6fa diff --git a/contrib/snappy b/contrib/snappy index 3786173af20..fb057edfed8 160000 --- a/contrib/snappy +++ b/contrib/snappy @@ -1 +1 @@ -Subproject commit 3786173af204d21da97180977ad6ab4321138b3d +Subproject commit fb057edfed820212076239fd32cb2ff23e9016bf diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 406b36e9251..84ce7ae5742 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -340,6 +340,3 @@ fi # Forbid files that differ only by character case find $ROOT_PATH | sort -f | uniq -i -c | awk '{ if ($1 > 1) print }' - -# Forbid recursive submodules -find $ROOT_PATH/contrib -name '.gitmodules' -size +0 | xargs cat | grep -P '.' && echo "Recursive submodules are forbidden." From 2b2232c264e655df20d92d29ad98447a182858ac Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 2 Jun 2022 06:42:29 -0300 Subject: [PATCH 66/69] test for #36995 (#37668) --- .../02312_is_not_null_prewhere.reference | 3 +++ .../02312_is_not_null_prewhere.sql | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/queries/0_stateless/02312_is_not_null_prewhere.reference create mode 100644 tests/queries/0_stateless/02312_is_not_null_prewhere.sql diff --git a/tests/queries/0_stateless/02312_is_not_null_prewhere.reference b/tests/queries/0_stateless/02312_is_not_null_prewhere.reference new file mode 100644 index 00000000000..bdaa7374c1b --- /dev/null +++ b/tests/queries/0_stateless/02312_is_not_null_prewhere.reference @@ -0,0 +1,3 @@ +2022-01-01 00:00:00 1 +2022-01-01 00:00:00 1 +2022-01-01 00:00:00 1 diff --git a/tests/queries/0_stateless/02312_is_not_null_prewhere.sql b/tests/queries/0_stateless/02312_is_not_null_prewhere.sql new file mode 100644 index 00000000000..56371d0ec6c --- /dev/null +++ b/tests/queries/0_stateless/02312_is_not_null_prewhere.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS bug_36995; + +CREATE TABLE bug_36995( + `time` DateTime, + `product` String) +ENGINE = MergeTree +ORDER BY time AS +SELECT '2022-01-01 00:00:00','1'; + +SELECT * FROM bug_36995 +WHERE (time IS NOT NULL) AND (product IN (SELECT '1')) +SETTINGS optimize_move_to_prewhere = 1; + +SELECT * FROM bug_36995 +WHERE (time IS NOT NULL) AND (product IN (SELECT '1')) +SETTINGS optimize_move_to_prewhere = 0; + +SELECT * FROM bug_36995 +PREWHERE (time IS NOT NULL) WHERE (product IN (SELECT '1')); + +DROP TABLE bug_36995; From d34e051c69d3acaefc8828d00b925af142ecef40 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Jun 2022 11:46:33 +0200 Subject: [PATCH 67/69] Support for simultaneous read from local and remote parallel replica (#37204) --- src/Interpreters/Cluster.cpp | 10 + src/Interpreters/Cluster.h | 1 + .../ClusterProxy/IStreamFactory.h | 65 ------ .../ClusterProxy/SelectStreamFactory.cpp | 104 +++++++++- .../ClusterProxy/SelectStreamFactory.h | 59 +++++- .../ClusterProxy/executeQuery.cpp | 92 ++++++++- src/Interpreters/ClusterProxy/executeQuery.h | 15 +- src/Interpreters/InterpreterSelectQuery.cpp | 7 +- src/Interpreters/InterpreterSelectQuery.h | 2 +- .../QueryPlan/DistributedCreateLocalPlan.cpp | 4 +- 
.../QueryPlan/DistributedCreateLocalPlan.h | 2 + src/Processors/QueryPlan/ReadFromRemote.cpp | 192 +++++++++++------- src/Processors/QueryPlan/ReadFromRemote.h | 60 ++++-- .../ParallelReplicasReadingCoordinator.h | 2 + src/Storages/StorageDistributed.cpp | 27 ++- ..._parallel_processing_on_replicas_part_1.sh | 2 +- 16 files changed, 468 insertions(+), 176 deletions(-) delete mode 100644 src/Interpreters/ClusterProxy/IStreamFactory.h diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index d817988e7b6..54f55c7b1f6 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -418,6 +418,8 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, if (address.is_local) info.local_addresses.push_back(address); + info.all_addresses.push_back(address); + auto pool = ConnectionPoolFactory::instance().get( settings.distributed_connections_pool_size, address.host_name, address.port, @@ -485,6 +487,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, } Addresses shard_local_addresses; + Addresses shard_all_addresses; ConnectionPoolPtrs all_replicas_pools; all_replicas_pools.reserve(replica_addresses.size()); @@ -502,6 +505,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, all_replicas_pools.emplace_back(replica_pool); if (replica.is_local) shard_local_addresses.push_back(replica); + shard_all_addresses.push_back(replica); } ConnectionPoolWithFailoverPtr shard_pool = std::make_shared( @@ -516,6 +520,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, current_shard_num, weight, std::move(shard_local_addresses), + std::move(shard_all_addresses), std::move(shard_pool), std::move(all_replicas_pools), internal_replication @@ -571,6 +576,7 @@ Cluster::Cluster( addresses_with_failover.emplace_back(current); Addresses shard_local_addresses; + Addresses all_addresses; ConnectionPoolPtrs all_replicas; all_replicas.reserve(current.size()); @@ -585,6 +591,7 @@ Cluster::Cluster( all_replicas.emplace_back(replica_pool); if (replica.is_local && !treat_local_as_remote) shard_local_addresses.push_back(replica); + all_addresses.push_back(replica); } ConnectionPoolWithFailoverPtr shard_pool = std::make_shared( @@ -597,6 +604,7 @@ Cluster::Cluster( current_shard_num, default_weight, std::move(shard_local_addresses), + std::move(all_addresses), std::move(shard_pool), std::move(all_replicas), false // has_internal_replication @@ -680,6 +688,8 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti if (address.is_local) info.local_addresses.push_back(address); + info.all_addresses.push_back(address); + auto pool = ConnectionPoolFactory::instance().get( settings.distributed_connections_pool_size, address.host_name, diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index 7c8d15d0350..5ce011782fc 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -202,6 +202,7 @@ public: UInt32 shard_num = 0; UInt32 weight = 1; Addresses local_addresses; + Addresses all_addresses; /// nullptr if there are no remote addresses ConnectionPoolWithFailoverPtr pool; /// Connection pool for each replica, contains nullptr for local replicas diff --git a/src/Interpreters/ClusterProxy/IStreamFactory.h b/src/Interpreters/ClusterProxy/IStreamFactory.h deleted file mode 100644 index 483ce9dcab9..00000000000 --- a/src/Interpreters/ClusterProxy/IStreamFactory.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -struct Settings; 
-class Cluster; -class Throttler; -struct SelectQueryInfo; - -class Pipe; -using Pipes = std::vector; - -class QueryPlan; -using QueryPlanPtr = std::unique_ptr; - -struct StorageID; - -namespace ClusterProxy -{ - -/// Base class for the implementation of the details of distributed query -/// execution that are specific to the query type. -class IStreamFactory -{ -public: - virtual ~IStreamFactory() = default; - - struct Shard - { - /// Query and header may be changed depending on shard. - ASTPtr query; - Block header; - - size_t shard_num = 0; - size_t num_replicas = 0; - ConnectionPoolWithFailoverPtr pool; - ConnectionPoolPtrs per_replica_pools; - - /// If we connect to replicas lazily. - /// (When there is a local replica with big delay). - bool lazy = false; - UInt32 local_delay = 0; - }; - - using Shards = std::vector; - - virtual void createForShard( - const Cluster::ShardInfo & shard_info, - const ASTPtr & query_ast, - const StorageID & main_table, - const ASTPtr & table_func_ptr, - ContextPtr context, - std::vector & local_plans, - Shards & remote_shards, - UInt32 shard_count) = 0; -}; - -} - -} diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index a2afed3759f..89123cda531 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -10,14 +11,15 @@ #include #include +#include #include #include +#include #include #include #include #include - namespace ProfileEvents { extern const Event DistributedConnectionMissingTable; @@ -63,7 +65,8 @@ void SelectStreamFactory::createForShard( auto emplace_local_stream = [&]() { - local_plans.emplace_back(createLocalPlan(query_ast, header, context, processed_stage, shard_info.shard_num, shard_count, /*coordinator=*/nullptr)); + local_plans.emplace_back(createLocalPlan( + query_ast, header, context, processed_stage, shard_info.shard_num, shard_count, /*replica_num=*/0, /*replica_count=*/0, /*coordinator=*/nullptr)); }; auto emplace_remote_stream = [&](bool lazy = false, UInt32 local_delay = 0) @@ -71,10 +74,7 @@ void SelectStreamFactory::createForShard( remote_shards.emplace_back(Shard{ .query = query_ast, .header = header, - .shard_num = shard_info.shard_num, - .num_replicas = shard_info.getAllNodeCount(), - .pool = shard_info.pool, - .per_replica_pools = shard_info.per_replica_pools, + .shard_info = shard_info, .lazy = lazy, .local_delay = local_delay, }); @@ -173,5 +173,97 @@ void SelectStreamFactory::createForShard( emplace_remote_stream(); } + +SelectStreamFactory::ShardPlans SelectStreamFactory::createForShardWithParallelReplicas( + const Cluster::ShardInfo & shard_info, + const ASTPtr & query_ast, + const StorageID & main_table, + const ASTPtr & table_function_ptr, + const ThrottlerPtr & throttler, + ContextPtr context, + UInt32 shard_count) +{ + SelectStreamFactory::ShardPlans result; + + if (auto it = objects_by_shard.find(shard_info.shard_num); it != objects_by_shard.end()) + replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast); + + const auto & settings = context->getSettingsRef(); + + auto is_local_replica_obsolete = [&]() + { + auto resolved_id = context->resolveStorageID(main_table); + auto main_table_storage = DatabaseCatalog::instance().tryGetTable(resolved_id, context); + const auto * replicated_storage = dynamic_cast(main_table_storage.get()); + + if (!replicated_storage) + return false; + 
+ UInt64 max_allowed_delay = settings.max_replica_delay_for_distributed_queries; + + if (!max_allowed_delay) + return false; + + UInt32 local_delay = replicated_storage->getAbsoluteDelay(); + return local_delay >= max_allowed_delay; + }; + + size_t next_replica_number = 0; + size_t all_replicas_count = shard_info.getRemoteNodeCount(); + + auto coordinator = std::make_shared(); + auto remote_plan = std::make_unique(); + + + if (settings.prefer_localhost_replica && shard_info.isLocal()) + { + /// We don't need more than one local replica in parallel reading + if (!is_local_replica_obsolete()) + { + ++all_replicas_count; + + result.local_plan = createLocalPlan( + query_ast, header, context, processed_stage, shard_info.shard_num, shard_count, next_replica_number, all_replicas_count, coordinator); + + ++next_replica_number; + } + } + + Scalars scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{}; + scalars.emplace( + "_shard_count", Block{{DataTypeUInt32().createColumnConst(1, shard_count), std::make_shared(), "_shard_count"}}); + auto external_tables = context->getExternalTables(); + + auto shard = Shard{ + .query = query_ast, + .header = header, + .shard_info = shard_info, + .lazy = false, + .local_delay = 0, + }; + + if (shard_info.hasRemoteConnections()) + { + auto read_from_remote = std::make_unique( + coordinator, + shard, + header, + processed_stage, + main_table, + table_function_ptr, + context, + throttler, + std::move(scalars), + std::move(external_tables), + &Poco::Logger::get("ReadFromParallelRemoteReplicasStep"), + shard_count); + + remote_plan->addStep(std::move(read_from_remote)); + result.remote_plan = std::move(remote_plan); + } + + return result; +} + } } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 731bf3acd10..f64e57e1316 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -1,22 +1,56 @@ #pragma once #include -#include #include #include #include +#include +#include +#include namespace DB { + +struct Settings; +class Cluster; +class Throttler; +struct SelectQueryInfo; + +class Pipe; +using Pipes = std::vector; + +class QueryPlan; +using QueryPlanPtr = std::unique_ptr; + +struct StorageID; + namespace ClusterProxy { + using ColumnsDescriptionByShardNum = std::unordered_map; -class SelectStreamFactory final : public IStreamFactory +class SelectStreamFactory { public: + + struct Shard + { + /// Query and header may be changed depending on shard. + ASTPtr query; + Block header; + + Cluster::ShardInfo shard_info; + + /// If we connect to replicas lazily. + /// (When there is a local replica with big delay). 
+ bool lazy = false; + UInt32 local_delay = 0; + }; + + using Shards = std::vector; + SelectStreamFactory( const Block & header_, const ColumnsDescriptionByShardNum & objects_by_shard_, @@ -31,7 +65,26 @@ public: ContextPtr context, std::vector & local_plans, Shards & remote_shards, - UInt32 shard_count) override; + UInt32 shard_count); + + struct ShardPlans + { + /// If a shard has local replicas this won't be nullptr + std::unique_ptr local_plan; + + /// Contains several steps to read from all remote replicas + std::unique_ptr remote_plan; + }; + + ShardPlans createForShardWithParallelReplicas( + const Cluster::ShardInfo & shard_info, + const ASTPtr & query_ast, + const StorageID & main_table, + const ASTPtr & table_function_ptr, + const ThrottlerPtr & throttler, + ContextPtr context, + UInt32 shard_count + ); private: const Block header; diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 3f1823fb171..e7a0e24fc7b 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -20,6 +20,7 @@ namespace DB namespace ErrorCodes { extern const int TOO_LARGE_DISTRIBUTED_DEPTH; + extern const int LOGICAL_ERROR; } namespace ClusterProxy @@ -106,21 +107,19 @@ void executeQuery( QueryProcessingStage::Enum processed_stage, const StorageID & main_table, const ASTPtr & table_func_ptr, - IStreamFactory & stream_factory, Poco::Logger * log, + SelectStreamFactory & stream_factory, Poco::Logger * log, const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, const ClusterPtr & not_optimized_cluster) { - assert(log); - const Settings & settings = context->getSettingsRef(); if (settings.max_distributed_depth && context->getClientInfo().distributed_depth >= settings.max_distributed_depth) throw Exception("Maximum distributed depth exceeded", ErrorCodes::TOO_LARGE_DISTRIBUTED_DEPTH); std::vector plans; - IStreamFactory::Shards remote_shards; + SelectStreamFactory::Shards remote_shards; auto new_context = updateSettingsForCluster(*query_info.getCluster(), context, settings, log); @@ -213,6 +212,91 @@ void executeQuery( query_plan.unitePlans(std::move(union_step), std::move(plans)); } + +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + SelectStreamFactory & stream_factory, + const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, + const ExpressionActionsPtr & sharding_key_expr, + const std::string & sharding_key_column_name, + const ClusterPtr & not_optimized_cluster) +{ + const Settings & settings = context->getSettingsRef(); + + ThrottlerPtr user_level_throttler; + if (auto * process_list_element = context->getProcessListElement()) + user_level_throttler = process_list_element->getUserNetworkThrottler(); + + /// Network bandwidth limit, if needed. 
+ ThrottlerPtr throttler; + if (settings.max_network_bandwidth || settings.max_network_bytes) + { + throttler = std::make_shared( + settings.max_network_bandwidth, + settings.max_network_bytes, + "Limit for bytes to send or receive over network exceeded.", + user_level_throttler); + } + else + throttler = user_level_throttler; + + + std::vector plans; + size_t shards = query_info.getCluster()->getShardCount(); + + for (const auto & shard_info : query_info.getCluster()->getShardsInfo()) + { + ASTPtr query_ast_for_shard; + if (query_info.optimized_cluster && settings.optimize_skip_unused_shards_rewrite_in && shards > 1) + { + query_ast_for_shard = query_ast->clone(); + + OptimizeShardingKeyRewriteInVisitor::Data visitor_data{ + sharding_key_expr, + sharding_key_expr->getSampleBlock().getByPosition(0).type, + sharding_key_column_name, + shard_info, + not_optimized_cluster->getSlotToShard(), + }; + OptimizeShardingKeyRewriteInVisitor visitor(visitor_data); + visitor.visit(query_ast_for_shard); + } + else + query_ast_for_shard = query_ast; + + auto shard_plans = stream_factory.createForShardWithParallelReplicas(shard_info, + query_ast_for_shard, main_table, table_func_ptr, throttler, context, shards); + + if (!shard_plans.local_plan && !shard_plans.remote_plan) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No plans were generated for reading from shard. This is a bug"); + + if (shard_plans.local_plan) + plans.emplace_back(std::move(shard_plans.local_plan)); + + if (shard_plans.remote_plan) + plans.emplace_back(std::move(shard_plans.remote_plan)); + } + + if (plans.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No plans were generated for reading from Distributed. This is a bug"); + + if (plans.size() == 1) + { + query_plan = std::move(*plans.front()); + return; + } + + DataStreams input_streams; + input_streams.reserve(plans.size()); + for (const auto & plan : plans) + input_streams.emplace_back(plan->getCurrentDataStream()); + + auto union_step = std::make_unique(std::move(input_streams)); + query_plan.unitePlans(std::move(union_step), std::move(plans)); +} + } } diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index d38bbe0fd5b..1a5035015a7 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -23,7 +23,7 @@ struct StorageID; namespace ClusterProxy { -class IStreamFactory; +class SelectStreamFactory; /// Update settings for Distributed query. 
/// @@ -46,7 +46,18 @@ void executeQuery( QueryProcessingStage::Enum processed_stage, const StorageID & main_table, const ASTPtr & table_func_ptr, - IStreamFactory & stream_factory, Poco::Logger * log, + SelectStreamFactory & stream_factory, Poco::Logger * log, + const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, + const ExpressionActionsPtr & sharding_key_expr, + const std::string & sharding_key_column_name, + const ClusterPtr & not_optimized_cluster); + + +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + SelectStreamFactory & stream_factory, const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index fbafb98e0d8..a62533cfc8a 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1728,12 +1728,11 @@ void InterpreterSelectQuery::setMergeTreeReadTaskCallbackAndClientInfo(MergeTree context->setMergeTreeReadTaskCallback(std::move(callback)); } -void InterpreterSelectQuery::setProperClientInfo() +void InterpreterSelectQuery::setProperClientInfo(size_t replica_num, size_t replica_count) { context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - assert(options.shard_count.has_value() && options.shard_num.has_value()); - context->getClientInfo().count_participating_replicas = *options.shard_count; - context->getClientInfo().number_of_current_replica = *options.shard_num; + context->getClientInfo().count_participating_replicas = replica_count; + context->getClientInfo().number_of_current_replica = replica_num; } bool InterpreterSelectQuery::shouldMoveToPrewhere() diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 8aee3c7c273..b7807a486b5 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -125,7 +125,7 @@ public: void setMergeTreeReadTaskCallbackAndClientInfo(MergeTreeReadTaskCallback && callback); /// It will set shard_num and shard_count to the client_info - void setProperClientInfo(); + void setProperClientInfo(size_t replica_num, size_t replica_count); private: InterpreterSelectQuery( diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp index 854a677afb9..f91c8020509 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp @@ -41,6 +41,8 @@ std::unique_ptr createLocalPlan( QueryProcessingStage::Enum processed_stage, UInt32 shard_num, UInt32 shard_count, + size_t replica_num, + size_t replica_count, std::shared_ptr coordinator) { checkStackSize(); @@ -56,7 +58,7 @@ std::unique_ptr createLocalPlan( .setShardInfo(shard_num, shard_count) .ignoreASTOptimizations()); - interpreter.setProperClientInfo(); + interpreter.setProperClientInfo(replica_num, replica_count); if (coordinator) { interpreter.setMergeTreeReadTaskCallbackAndClientInfo([coordinator](PartitionReadRequest request) -> std::optional diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h index fdfe1709833..b55cedf9871 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h +++ 
b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h @@ -15,6 +15,8 @@ std::unique_ptr createLocalPlan( QueryProcessingStage::Enum processed_stage, UInt32 shard_num, UInt32 shard_count, + size_t replica_num, + size_t replica_count, std::shared_ptr coordinator); } diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 35db2b1548a..867daaff30c 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -16,6 +16,8 @@ #include #include +#include + namespace DB { @@ -62,7 +64,7 @@ static String formattedAST(const ASTPtr & ast) } ReadFromRemote::ReadFromRemote( - ClusterProxy::IStreamFactory::Shards shards_, + ClusterProxy::SelectStreamFactory::Shards shards_, Block header_, QueryProcessingStage::Enum stage_, StorageID main_table_, @@ -87,10 +89,7 @@ ReadFromRemote::ReadFromRemote( { } -void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFactory::Shard & shard, - std::shared_ptr coordinator, - std::shared_ptr pool, - std::optional replica_info) +void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard) { bool add_agg_info = stage == QueryProcessingStage::WithMergeableState; bool add_totals = false; @@ -103,10 +102,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFacto } auto lazily_create_stream = [ - replica_info = replica_info, - pool = pool ? pool : shard.pool, - coordinator = coordinator, - shard_num = shard.shard_num, shard_count = shard_count, query = shard.query, header = shard.header, + shard = shard, shard_count = shard_count, query = shard.query, header = shard.header, context = context, throttler = throttler, main_table = main_table, table_func_ptr = table_func_ptr, scalars = scalars, external_tables = external_tables, @@ -122,15 +118,15 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFacto try { if (table_func_ptr) - try_results = pool->getManyForTableFunction(timeouts, ¤t_settings, PoolMode::GET_MANY); + try_results = shard.shard_info.pool->getManyForTableFunction(timeouts, ¤t_settings, PoolMode::GET_MANY); else - try_results = pool->getManyChecked(timeouts, ¤t_settings, PoolMode::GET_MANY, main_table.getQualifiedName()); + try_results = shard.shard_info.pool->getManyChecked(timeouts, ¤t_settings, PoolMode::GET_MANY, main_table.getQualifiedName()); } catch (const Exception & ex) { if (ex.code() == ErrorCodes::ALL_CONNECTION_TRIES_FAILED) LOG_WARNING(&Poco::Logger::get("ClusterProxy::SelectStreamFactory"), - "Connections to remote replicas of local shard {} failed, will use stale local replica", shard_num); + "Connections to remote replicas of local shard {} failed, will use stale local replica", shard.shard_info.shard_num); else throw; } @@ -144,7 +140,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFacto if (try_results.empty() || local_delay < max_remote_delay) { - auto plan = createLocalPlan(query, header, context, stage, shard_num, shard_count, coordinator); + auto plan = createLocalPlan(query, header, context, stage, shard.shard_info.shard_num, shard_count, 0, 0, /*coordinator=*/nullptr); return QueryPipelineBuilder::getPipe(std::move(*plan->buildQueryPipeline( QueryPlanOptimizationSettings::fromContext(context), @@ -160,10 +156,9 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFacto String query_string = formattedAST(query); scalars["_shard_num"] - = 
Block{{DataTypeUInt32().createColumnConst(1, shard_num), std::make_shared(), "_shard_num"}}; + = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared(), "_shard_num"}}; auto remote_query_executor = std::make_shared( - pool, std::move(connections), query_string, header, context, throttler, scalars, external_tables, stage, - RemoteQueryExecutor::Extension{.parallel_reading_coordinator = std::move(coordinator), .replica_info = replica_info}); + shard.shard_info.pool, std::move(connections), query_string, header, context, throttler, scalars, external_tables, stage); return createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read); } @@ -174,10 +169,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFacto addConvertingActions(pipes.back(), output_stream->header); } -void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::IStreamFactory::Shard & shard, - std::shared_ptr coordinator, - std::shared_ptr pool, - std::optional replica_info) +void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard) { bool add_agg_info = stage == QueryProcessingStage::WithMergeableState; bool add_totals = false; @@ -192,20 +184,15 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::IStreamFactory:: String query_string = formattedAST(shard.query); scalars["_shard_num"] - = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_num), std::make_shared(), "_shard_num"}}; + = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared(), "_shard_num"}}; std::shared_ptr remote_query_executor; remote_query_executor = std::make_shared( - pool ? pool : shard.pool, query_string, shard.header, context, throttler, scalars, external_tables, stage, - RemoteQueryExecutor::Extension{.parallel_reading_coordinator = std::move(coordinator), .replica_info = std::move(replica_info)}); + shard.shard_info.pool, query_string, shard.header, context, throttler, scalars, external_tables, stage); remote_query_executor->setLogger(log); - - /// In case of parallel reading from replicas we have a connection pool per replica. - /// Setting PoolMode will make no sense. - if (!pool) - remote_query_executor->setPoolMode(PoolMode::GET_MANY); + remote_query_executor->setPoolMode(PoolMode::GET_MANY); if (!table_func_ptr) remote_query_executor->setMainTable(main_table); @@ -219,52 +206,119 @@ void ReadFromRemote::initializePipeline(QueryPipelineBuilder & pipeline, const B { Pipes pipes; - const auto & settings = context->getSettingsRef(); - const bool enable_sample_offset_parallel_processing = settings.max_parallel_replicas > 1 && !settings.allow_experimental_parallel_reading_from_replicas; - - /// We have to create a pipe for each replica - /// FIXME: The second condition is only for tests to work, because hedged connections enabled by default. 
- if (settings.max_parallel_replicas > 1 && !enable_sample_offset_parallel_processing && !context->getSettingsRef().use_hedged_requests) + for (const auto & shard : shards) { - const Settings & current_settings = context->getSettingsRef(); - auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - - for (const auto & shard : shards) - { - auto coordinator = std::make_shared(); - - for (size_t replica_num = 0; replica_num < shard.num_replicas; ++replica_num) - { - IConnections::ReplicaInfo replica_info - { - .all_replicas_count = shard.num_replicas, - .number_of_current_replica = replica_num - }; - - auto pool = shard.per_replica_pools[replica_num]; - auto pool_with_failover = std::make_shared( - ConnectionPoolPtrs{pool}, current_settings.load_balancing); - - if (shard.lazy) - addLazyPipe(pipes, shard, coordinator, pool_with_failover, replica_info); - else - addPipe(pipes, shard, coordinator, pool_with_failover, replica_info); - } - } - } - else - { - for (const auto & shard : shards) - { - if (shard.lazy) - addLazyPipe(pipes, shard, /*coordinator=*/nullptr, /*pool*/{}, /*replica_info*/std::nullopt); - else - addPipe(pipes, shard, /*coordinator=*/nullptr, /*pool*/{}, /*replica_info*/std::nullopt); - } + if (shard.lazy) + addLazyPipe(pipes, shard); + else + addPipe(pipes, shard); } auto pipe = Pipe::unitePipes(std::move(pipes)); pipeline.init(std::move(pipe)); } + +ReadFromParallelRemoteReplicasStep::ReadFromParallelRemoteReplicasStep( + ParallelReplicasReadingCoordinatorPtr coordinator_, + ClusterProxy::SelectStreamFactory::Shard shard_, + Block header_, + QueryProcessingStage::Enum stage_, + StorageID main_table_, + ASTPtr table_func_ptr_, + ContextPtr context_, + ThrottlerPtr throttler_, + Scalars scalars_, + Tables external_tables_, + Poco::Logger * log_, + UInt32 shard_count_) + : ISourceStep(DataStream{.header = std::move(header_)}) + , coordinator(std::move(coordinator_)) + , shard(std::move(shard_)) + , stage(std::move(stage_)) + , main_table(std::move(main_table_)) + , table_func_ptr(table_func_ptr_) + , context(context_) + , throttler(throttler_) + , scalars(scalars_) + , external_tables{external_tables_} + , log(log_) + , shard_count(shard_count_) +{ + std::vector description; + + for (const auto & address : shard.shard_info.all_addresses) + if (!address.is_local) + description.push_back(fmt::format("Replica: {}", address.host_name)); + + setStepDescription(boost::algorithm::join(description, ", ")); +} + + +void ReadFromParallelRemoteReplicasStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +{ + Pipes pipes; + + const Settings & current_settings = context->getSettingsRef(); + auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); + + for (size_t replica_num = 0; replica_num < shard.shard_info.getAllNodeCount(); ++replica_num) + { + if (shard.shard_info.all_addresses[replica_num].is_local) + continue; + + IConnections::ReplicaInfo replica_info + { + .all_replicas_count = shard.shard_info.getAllNodeCount(), + .number_of_current_replica = replica_num + }; + + auto pool = shard.shard_info.per_replica_pools[replica_num]; + assert(pool); + + auto pool_with_failover = std::make_shared( + ConnectionPoolPtrs{pool}, current_settings.load_balancing); + + addPipeForSingeReplica(pipes, pool_with_failover, replica_info); + } + + auto pipe = Pipe::unitePipes(std::move(pipes)); + pipeline.init(std::move(pipe)); + +} + + +void ReadFromParallelRemoteReplicasStep::addPipeForSingeReplica(Pipes & pipes, 
std::shared_ptr pool, IConnections::ReplicaInfo replica_info) +{ + bool add_agg_info = stage == QueryProcessingStage::WithMergeableState; + bool add_totals = false; + bool add_extremes = false; + bool async_read = context->getSettingsRef().async_socket_for_remote; + if (stage == QueryProcessingStage::Complete) + { + add_totals = shard.query->as().group_by_with_totals; + add_extremes = context->getSettingsRef().extremes; + } + + String query_string = formattedAST(shard.query); + + scalars["_shard_num"] + = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared(), "_shard_num"}}; + + std::shared_ptr remote_query_executor; + + remote_query_executor = std::make_shared( + pool, query_string, shard.header, context, throttler, scalars, external_tables, stage, + RemoteQueryExecutor::Extension{.parallel_reading_coordinator = coordinator, .replica_info = std::move(replica_info)}); + + remote_query_executor->setLogger(log); + + if (!table_func_ptr) + remote_query_executor->setMainTable(main_table); + + pipes.emplace_back(createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read)); + pipes.back().addInterpreterContext(context); + addConvertingActions(pipes.back(), output_stream->header); +} + } diff --git a/src/Processors/QueryPlan/ReadFromRemote.h b/src/Processors/QueryPlan/ReadFromRemote.h index f361be93b5a..e8f4ee5fd37 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.h +++ b/src/Processors/QueryPlan/ReadFromRemote.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include namespace DB @@ -22,7 +22,7 @@ class ReadFromRemote final : public ISourceStep { public: ReadFromRemote( - ClusterProxy::IStreamFactory::Shards shards_, + ClusterProxy::SelectStreamFactory::Shards shards_, Block header_, QueryProcessingStage::Enum stage_, StorageID main_table_, @@ -45,7 +45,7 @@ private: PerShard }; - ClusterProxy::IStreamFactory::Shards shards; + ClusterProxy::SelectStreamFactory::Shards shards; QueryProcessingStage::Enum stage; StorageID main_table; @@ -60,16 +60,52 @@ private: Poco::Logger * log; UInt32 shard_count; - void addLazyPipe(Pipes & pipes, const ClusterProxy::IStreamFactory::Shard & shard, - std::shared_ptr coordinator, - std::shared_ptr pool, - std::optional replica_info); - void addPipe(Pipes & pipes, const ClusterProxy::IStreamFactory::Shard & shard, - std::shared_ptr coordinator, - std::shared_ptr pool, - std::optional replica_info); + void addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard); + void addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard); +}; - void addPipeForReplica(); + +class ReadFromParallelRemoteReplicasStep : public ISourceStep +{ +public: + ReadFromParallelRemoteReplicasStep( + ParallelReplicasReadingCoordinatorPtr coordinator_, + ClusterProxy::SelectStreamFactory::Shard shard, + Block header_, + QueryProcessingStage::Enum stage_, + StorageID main_table_, + ASTPtr table_func_ptr_, + ContextPtr context_, + ThrottlerPtr throttler_, + Scalars scalars_, + Tables external_tables_, + Poco::Logger * log_, + UInt32 shard_count_); + + String getName() const override { return "ReadFromRemoteParallelReplicas"; } + + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; + +private: + + void addPipeForSingeReplica(Pipes & pipes, std::shared_ptr pool, IConnections::ReplicaInfo replica_info); + + ParallelReplicasReadingCoordinatorPtr coordinator; + ClusterProxy::SelectStreamFactory::Shard shard; 
+ QueryProcessingStage::Enum stage; + + StorageID main_table; + ASTPtr table_func_ptr; + + ContextPtr context; + + ThrottlerPtr throttler; + Scalars scalars; + Tables external_tables; + + Poco::Logger * log; + + UInt32 shard_count{0}; }; } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h index af74e0fae49..bd2082be6c2 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h @@ -17,4 +17,6 @@ private: std::unique_ptr pimpl; }; +using ParallelReplicasReadingCoordinatorPtr = std::shared_ptr; + } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 2c3a1489943..a5f20c56c9d 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -707,13 +707,25 @@ void StorageDistributed::read( storage_snapshot, processed_stage); - ClusterProxy::executeQuery( - query_plan, header, processed_stage, - main_table, remote_table_function_ptr, - select_stream_factory, log, modified_query_ast, - local_context, query_info, - sharding_key_expr, sharding_key_column_name, - query_info.cluster); + + auto settings = local_context->getSettingsRef(); + bool parallel_replicas = settings.max_parallel_replicas > 1 && settings.allow_experimental_parallel_reading_from_replicas && !settings.use_hedged_requests; + + if (parallel_replicas) + ClusterProxy::executeQueryWithParallelReplicas( + query_plan, main_table, remote_table_function_ptr, + select_stream_factory, modified_query_ast, + local_context, query_info, + sharding_key_expr, sharding_key_column_name, + query_info.cluster); + else + ClusterProxy::executeQuery( + query_plan, header, processed_stage, + main_table, remote_table_function_ptr, + select_stream_factory, log, modified_query_ast, + local_context, query_info, + sharding_key_expr, sharding_key_column_name, + query_info.cluster); /// This is a bug, it is possible only when there is no shards to query, and this is handled earlier. 
if (!query_plan.isInitialized()) @@ -1523,4 +1535,3 @@ void registerStorageDistributed(StorageFactory & factory) } } - diff --git a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh index bbb5d903ea9..ecd0d281b53 100755 --- a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh +++ b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh @@ -10,7 +10,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # All replicas are localhost, disable `prefer_localhost_replica` option to test network interface # Currently this feature could not work with hedged requests # Enabling `enable_sample_offset_parallel_processing` feature could lead to intersecting marks, so some of them would be thrown away and it will lead to incorrect result of SELECT query -SETTINGS="--max_parallel_replicas=3 --use_hedged_requests=false --async_socket_for_remote=false --allow_experimental_parallel_reading_from_replicas=true" +SETTINGS="--max_parallel_replicas=3 --use_hedged_requests=false --allow_experimental_parallel_reading_from_replicas=true" # Prepare tables $CLICKHOUSE_CLIENT $SETTINGS -nm -q ''' From 6a516099152c8c14ffbcc6ffdc79c46077918889 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 2 Jun 2022 13:35:23 +0300 Subject: [PATCH 68/69] Update 02316_const_string_intersact.sql --- tests/queries/0_stateless/02316_const_string_intersact.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02316_const_string_intersact.sql b/tests/queries/0_stateless/02316_const_string_intersact.sql index ace3c8d03c5..18af398aa5d 100644 --- a/tests/queries/0_stateless/02316_const_string_intersact.sql +++ b/tests/queries/0_stateless/02316_const_string_intersact.sql @@ -1 +1,3 @@ +-- Tags: no-backward-compatibility-check + SELECT 'Play ClickHouse' InterSect SELECT 'Play ClickHouse' From 4baae59252ae629f39a1e1562c3e057abb72d5eb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 2 Jun 2022 14:04:28 +0300 Subject: [PATCH 69/69] Revert "Fix possible segfault in schema inference" --- src/Formats/ReadSchemaUtils.cpp | 21 ++++++++----------- ...18_template_schema_inference_bug.reference | 0 .../02318_template_schema_inference_bug.sql | 2 -- 3 files changed, 9 insertions(+), 14 deletions(-) delete mode 100644 tests/queries/0_stateless/02318_template_schema_inference_bug.reference delete mode 100755 tests/queries/0_stateless/02318_template_schema_inference_bug.sql diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 11a91bd50dc..035546031d8 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -100,21 +100,18 @@ ColumnsDescription readSchemaFromFormat( catch (...) 
{ auto exception_message = getCurrentExceptionMessage(false); - if (schema_reader) + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + if (rows_read != 0 && max_rows_to_read == 0) { - size_t rows_read = schema_reader->getNumRowsRead(); - assert(rows_read <= max_rows_to_read); - max_rows_to_read -= schema_reader->getNumRowsRead(); - if (rows_read != 0 && max_rows_to_read == 0) + exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference"; + if (iterations > 1) { - exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference"; - if (iterations > 1) - { - exception_messages += "\n" + exception_message; - break; - } - retry = false; + exception_messages += "\n" + exception_message; + break; } + retry = false; } if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) diff --git a/tests/queries/0_stateless/02318_template_schema_inference_bug.reference b/tests/queries/0_stateless/02318_template_schema_inference_bug.reference deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/queries/0_stateless/02318_template_schema_inference_bug.sql b/tests/queries/0_stateless/02318_template_schema_inference_bug.sql deleted file mode 100755 index 42646013dd5..00000000000 --- a/tests/queries/0_stateless/02318_template_schema_inference_bug.sql +++ /dev/null @@ -1,2 +0,0 @@ -insert into function file(data_02318.tsv) select * from numbers(10); -desc file('data_02318.tsv', 'Template') SETTINGS format_template_row='nonexist', format_template_resultset='nonexist'; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE}
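For reference, a minimal sketch of a query that exercises the parallel-replicas read path added in PATCH 67/69. The table name dist_table is hypothetical; the settings mirror the ones used in the updated 00168_parallel_processing_on_replicas_part_1.sh test, and per the StorageDistributed::read() change this path is taken only when max_parallel_replicas > 1, allow_experimental_parallel_reading_from_replicas is enabled, and use_hedged_requests is disabled:

-- dist_table is a placeholder for any Distributed table; settings follow the test script.
-- Hedged requests must be off, otherwise the regular ClusterProxy::executeQuery() path is used.
SELECT count()
FROM dist_table
SETTINGS
    max_parallel_replicas = 3,
    allow_experimental_parallel_reading_from_replicas = 1,
    use_hedged_requests = 0;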