From b7529986c0bc4ffe3ad4ebbbb6cf366091168374 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Jul 2022 12:09:32 +0000 Subject: [PATCH 001/164] Use separate counter for RSS in global memory tracker. --- src/Common/MemoryTracker.cpp | 61 ++++++++++++++---------- src/Common/MemoryTracker.h | 6 ++- src/Interpreters/AsynchronousMetrics.cpp | 19 +++++--- src/Interpreters/AsynchronousMetrics.h | 1 + 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index 51f4c83dc23..ba097568477 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -88,6 +88,7 @@ static constexpr size_t log_peak_memory_usage_every = 1ULL << 30; MemoryTracker total_memory_tracker(nullptr, VariableContext::Global); +std::atomic MemoryTracker::rss; MemoryTracker::MemoryTracker(VariableContext level_) : parent(&total_memory_tracker), level(level_) {} MemoryTracker::MemoryTracker(MemoryTracker * parent_, VariableContext level_) : parent(parent_), level(level_) {} @@ -131,6 +132,16 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT if (MemoryTrackerBlockerInThread::isBlocked(level)) { + if (level == VariableContext::Global) + { + /// For global memory tracker always update memory usage. + amount.fetch_add(size, std::memory_order_relaxed); + + auto metric_loaded = metric.load(std::memory_order_relaxed); + if (metric_loaded != CurrentMetrics::end()) + CurrentMetrics::add(metric_loaded, size); + } + /// Since the MemoryTrackerBlockerInThread should respect the level, we should go to the next parent. if (auto * loaded_next = parent.load(std::memory_order_relaxed)) loaded_next->allocImpl(size, throw_if_memory_exceeded, @@ -151,24 +162,6 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT Int64 current_hard_limit = hard_limit.load(std::memory_order_relaxed); Int64 current_profiler_limit = profiler_limit.load(std::memory_order_relaxed); - /// Cap the limit to the total_memory_tracker, since it may include some drift - /// for user-level memory tracker. - /// - /// And since total_memory_tracker is reset to the process resident - /// memory peridically (in AsynchronousMetrics::update()), any limit can be - /// capped to it, to avoid possible drift. 
- if (unlikely(current_hard_limit - && will_be > current_hard_limit - && level == VariableContext::User)) - { - Int64 total_amount = total_memory_tracker.get(); - if (amount > total_amount) - { - set(total_amount); - will_be = size + total_amount; - } - } - #ifdef MEMORY_TRACKER_DEBUG_CHECKS if (unlikely(memory_tracker_always_throw_logical_error_on_allocation)) { @@ -214,6 +207,16 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT allocation_traced = true; } + bool used_rss_counter = false; + if (level == VariableContext::Global) + { + if (Int64 current_rss = rss.load(std::memory_order_relaxed); unlikely(current_rss + size > will_be)) + { + used_rss_counter = true; + will_be = current_rss + size; + } + } + if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) { OvercommitResult overcommit_result = OvercommitResult::NONE; @@ -228,9 +231,10 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT const auto * description = description_ptr.load(std::memory_order_relaxed); throw DB::Exception( DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED, - "Memory limit{}{} exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}. OvercommitTracker decision: {}.", + "Memory limit{}{} {}exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}. OvercommitTracker decision: {}.", description ? " " : "", description ? description : "", + used_rss_counter ? "(RSS) " : "", formatReadableSizeWithBinarySuffix(will_be), size, formatReadableSizeWithBinarySuffix(current_hard_limit), @@ -303,6 +307,16 @@ void MemoryTracker::free(Int64 size) { if (MemoryTrackerBlockerInThread::isBlocked(level)) { + if (level == VariableContext::Global) + { + /// For global memory tracker always update memory usage. + amount.fetch_sub(size, std::memory_order_relaxed); + + auto metric_loaded = metric.load(std::memory_order_relaxed); + if (metric_loaded != CurrentMetrics::end()) + CurrentMetrics::add(metric_loaded, size); + } + /// Since the MemoryTrackerBlockerInThread should respect the level, we should go to the next parent. if (auto * loaded_next = parent.load(std::memory_order_relaxed)) loaded_next->free(size); @@ -317,7 +331,7 @@ void MemoryTracker::free(Int64 size) } Int64 accounted_size = size; - if (level == VariableContext::Thread) + if (level == VariableContext::Thread || level == VariableContext::Global) { /// Could become negative if memory allocated in this thread is freed in another one amount.fetch_sub(accounted_size, std::memory_order_relaxed); @@ -391,12 +405,9 @@ void MemoryTracker::reset() } -void MemoryTracker::set(Int64 to) +void MemoryTracker::setRSS(Int64 to) { - amount.store(to, std::memory_order_relaxed); - - bool log_memory_usage = true; - updatePeak(to, log_memory_usage); + rss.store(to, std::memory_order_relaxed); } diff --git a/src/Common/MemoryTracker.h b/src/Common/MemoryTracker.h index 58bd3a460bd..b66706aafe8 100644 --- a/src/Common/MemoryTracker.h +++ b/src/Common/MemoryTracker.h @@ -56,6 +56,8 @@ private: std::atomic hard_limit {0}; std::atomic profiler_limit {0}; + static std::atomic rss; + Int64 profiler_step = 0; /// To test exception safety of calling code, memory tracker throws an exception on each memory allocation with specified probability. @@ -212,8 +214,8 @@ public: /// Reset the accumulated data. void reset(); - /// Reset current counter to a new value. - void set(Int64 to); + /// Update RSS. 
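+ /// Set periodically by AsynchronousMetrics from the process resident memory (data.resident).
+ /// allocImpl() then checks the hard limit against max(tracked amount, rss) + size instead of
+ /// overwriting the tracked amount with RSS, as the old set() did.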
+ static void setRSS(Int64 to); /// Prints info about peak memory consumption into log. void logPeakMemoryUsage() const; diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 37ed418ec2a..9275c1d6840 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -670,21 +670,26 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti { Int64 amount = total_memory_tracker.get(); Int64 peak = total_memory_tracker.getPeak(); - Int64 new_amount = data.resident; + Int64 rss = data.resident; - Int64 difference = new_amount - amount; + Int64 rss_drift = rss - amount; + Int64 difference = rss_drift - last_logged_rss_drift; /// Log only if difference is high. This is for convenience. The threshold is arbitrary. if (difference >= 1048576 || difference <= -1048576) + { LOG_TRACE(log, - "MemoryTracking: was {}, peak {}, will set to {} (RSS), difference: {}", + "MemoryTracking: allocated {}, peak {}, RSS {}, difference: {}", ReadableSize(amount), ReadableSize(peak), - ReadableSize(new_amount), - ReadableSize(difference)); + ReadableSize(rss), + ReadableSize(rss_drift)); - total_memory_tracker.set(new_amount); - CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount); + last_logged_rss_drift = rss_drift; + } + + total_memory_tracker.setRSS(rss); + // CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount); } } #endif diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index e4bcb2890f3..3ba84219cb2 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -78,6 +78,7 @@ private: #if defined(OS_LINUX) || defined(OS_FREEBSD) MemoryStatisticsOS memory_stat; + Int64 last_logged_rss_drift = 0; #endif #if defined(OS_LINUX) From 12f4a489577e00095ed50d00478408bcfd434ad9 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 8 Jul 2022 06:48:05 +0000 Subject: [PATCH 002/164] Extend LUT range to 1900..2300 --- src/Common/DateLUTImpl.h | 76 ++++++++++++++++++++++++++++---------- src/Functions/makeDate.cpp | 7 ++-- 2 files changed, 60 insertions(+), 23 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 4bc9614abcb..209afc9e6f0 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -10,20 +10,27 @@ #include -#define DATE_LUT_MIN_YEAR 1925 /// 1925 since wast majority of timezones changed to 15-minute aligned offsets somewhere in 1924 or earlier. -#define DATE_LUT_MAX_YEAR 2283 /// Last supported year (complete) +#define DATE_LUT_MIN_YEAR 1900 /// 1900 since majority of financial organizations consider 1900 as an initial year. 
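+/// The range 1900-01-01 .. 2300-12-31 is 146462 days, which is exactly DATE_LUT_SIZE (0x23C1E) below,
+/// and the Unix epoch (1970-01-01) is 25567 days after the start of the range (DAYNUM_OFFSET_EPOCH).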
+// #define DATE_LUT_MAX_YEAR 2258 /// Last supported year (complete) +#define DATE_LUT_MAX_YEAR 2300 /// Last supported year (complete) #define DATE_LUT_YEARS (1 + DATE_LUT_MAX_YEAR - DATE_LUT_MIN_YEAR) /// Number of years in lookup table -#define DATE_LUT_SIZE 0x20000 +// #define DATE_LUT_SIZE 0x20000 +#define DATE_LUT_SIZE 0x23C1E + #define DATE_LUT_MAX (0xFFFFFFFFU - 86400) #define DATE_LUT_MAX_DAY_NUM 0xFFFF + +#define DAYNUM_OFFSET_EPOCH 25567 + /// Max int value of Date32, DATE LUT cache size minus daynum_offset_epoch -#define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - 16436) +// #define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - (Time)DAYNUM_OFFSET_EPOCH) +#define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - 25567) /// A constant to add to time_t so every supported time point becomes non-negative and still has the same remainder of division by 3600. /// If we treat "remainder of division" operation in the sense of modular arithmetic (not like in C++). -#define DATE_LUT_ADD ((1970 - DATE_LUT_MIN_YEAR) * 366 * 86400) +#define DATE_LUT_ADD ((1970 - DATE_LUT_MIN_YEAR) * 366L * 86400) #if defined(__PPC__) @@ -64,62 +71,88 @@ private: // Same as above but select different function overloads for zero saturation. STRONG_TYPEDEF(UInt32, LUTIndexWithSaturation) + static inline LUTIndex normalizeLUTIndex(UInt32 index) + { + if (index >= DATE_LUT_SIZE) + LUTIndex(DATE_LUT_SIZE - 1); + return LUTIndex{index}; + } + + static inline LUTIndex normalizeLUTIndex(Int64 index) + { + if (index < 0 ) + return LUTIndex(0); + if (index >= DATE_LUT_SIZE) + LUTIndex(DATE_LUT_SIZE - 1); + return LUTIndex{index}; + } + template friend inline LUTIndex operator+(const LUTIndex & index, const T v) { - return LUTIndex{(index.toUnderType() + UInt32(v)) & date_lut_mask}; + return normalizeLUTIndex(index.toUnderType() + UInt32(v)); + //return LUTIndex{(index.toUnderType() + UInt32(v)) & date_lut_mask}; } template friend inline LUTIndex operator+(const T v, const LUTIndex & index) { - return LUTIndex{(v + index.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(v + index.toUnderType()); + //return LUTIndex{(v + index.toUnderType()) & date_lut_mask}; } friend inline LUTIndex operator+(const LUTIndex & index, const LUTIndex & v) { - return LUTIndex{(index.toUnderType() + v.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(static_cast(index.toUnderType() + v.toUnderType())); + //return LUTIndex{(index.toUnderType() + v.toUnderType()) & date_lut_mask}; } template friend inline LUTIndex operator-(const LUTIndex & index, const T v) { - return LUTIndex{(index.toUnderType() - UInt32(v)) & date_lut_mask}; + return normalizeLUTIndex(static_cast(index.toUnderType() - UInt32(v))); + //return LUTIndex{(index.toUnderType() - UInt32(v)) & date_lut_mask}; } template friend inline LUTIndex operator-(const T v, const LUTIndex & index) { - return LUTIndex{(v - index.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(static_cast(v - index.toUnderType())); + //return LUTIndex{(v - index.toUnderType()) & date_lut_mask}; } friend inline LUTIndex operator-(const LUTIndex & index, const LUTIndex & v) { - return LUTIndex{(index.toUnderType() - v.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(static_cast(index.toUnderType() - v.toUnderType())); + //return LUTIndex{(index.toUnderType() - v.toUnderType()) & date_lut_mask}; } template friend inline LUTIndex operator*(const LUTIndex & index, const T v) { - return LUTIndex{(index.toUnderType() * UInt32(v)) & date_lut_mask}; + return 
normalizeLUTIndex(index.toUnderType() * UInt32(v)); + // return LUTIndex{(index.toUnderType() * UInt32(v)) /*& date_lut_mask*/}; } template friend inline LUTIndex operator*(const T v, const LUTIndex & index) { - return LUTIndex{(v * index.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(v * index.toUnderType()); + // return LUTIndex{(v * index.toUnderType()) /*& date_lut_mask*/}; } template friend inline LUTIndex operator/(const LUTIndex & index, const T v) { - return LUTIndex{(index.toUnderType() / UInt32(v)) & date_lut_mask}; + return normalizeLUTIndex(index.toUnderType() / UInt32(v)); + // return LUTIndex{(index.toUnderType() / UInt32(v)) /*& date_lut_mask*/}; } template friend inline LUTIndex operator/(const T v, const LUTIndex & index) { - return LUTIndex{(UInt32(v) / index.toUnderType()) & date_lut_mask}; + return normalizeLUTIndex(UInt32(v) / index.toUnderType()); + // return LUTIndex{(UInt32(v) / index.toUnderType()) /*& date_lut_mask*/}; } public: @@ -170,12 +203,13 @@ public: private: /// Mask is all-ones to allow efficient protection against overflow. - static constexpr UInt32 date_lut_mask = 0x1ffff; - static_assert(date_lut_mask == DATE_LUT_SIZE - 1); + // static constexpr UInt32 date_lut_mask = 0x1ffff; + // static_assert(date_lut_mask == DATE_LUT_SIZE - 1); /// Offset to epoch in days (ExtendedDayNum) of the first day in LUT. /// "epoch" is the Unix Epoch (starts at unix timestamp zero) - static constexpr UInt32 daynum_offset_epoch = 16436; + // static constexpr UInt32 daynum_offset_epoch = DAYNUM_OFFSET_EPOCH; + static constexpr UInt32 daynum_offset_epoch = 25567; static_assert(daynum_offset_epoch == (1970 - DATE_LUT_MIN_YEAR) * 365 + (1970 - DATE_LUT_MIN_YEAR / 4 * 4) / 4); /// Lookup table is indexed by LUTIndex. @@ -232,12 +266,14 @@ private: static inline LUTIndex toLUTIndex(DayNum d) { - return LUTIndex{(d + daynum_offset_epoch) & date_lut_mask}; + return normalizeLUTIndex(d + daynum_offset_epoch); + // return LUTIndex{(d + daynum_offset_epoch) /*& date_lut_mask*/}; } static inline LUTIndex toLUTIndex(ExtendedDayNum d) { - return LUTIndex{static_cast(d + daynum_offset_epoch) & date_lut_mask}; + return normalizeLUTIndex(static_cast(d + daynum_offset_epoch)); + // return LUTIndex{static_cast(d + daynum_offset_epoch) /*& date_lut_mask*/}; } inline LUTIndex toLUTIndex(Time t) const diff --git a/src/Functions/makeDate.cpp b/src/Functions/makeDate.cpp index dbf29322787..d97af38dbea 100644 --- a/src/Functions/makeDate.cpp +++ b/src/Functions/makeDate.cpp @@ -164,9 +164,10 @@ struct MakeDate32Traits using ReturnDataType = DataTypeDate32; using ReturnColumnType = ColumnInt32; - static constexpr auto MIN_YEAR = 1925; - static constexpr auto MAX_YEAR = 2283; - static constexpr auto MAX_DATE = YearMonthDayToSingleInt(MAX_YEAR, 11, 11); + static constexpr auto MIN_YEAR = 1900; //1925; + static constexpr auto MAX_YEAR = 2300; //2283; + // static constexpr auto MAX_DATE = YearMonthDayToSingleInt(MAX_YEAR, 11, 11); + static constexpr auto MAX_DATE = YearMonthDayToSingleInt(MAX_YEAR, 12, 31); }; /// Common implementation for makeDateTime, makeDateTime64 From 7cde9d3b404024ebc2539a9034deca66c12f52be Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 13 Jul 2022 15:57:55 +0000 Subject: [PATCH 003/164] Add new features in schema inference --- src/Core/Settings.h | 4 + src/DataTypes/transformTypesRecursively.cpp | 181 ++++++++++++ src/DataTypes/transformTypesRecursively.h | 17 ++ src/Formats/EscapingRuleUtils.cpp | 278 +++++++++++++++++- src/Formats/EscapingRuleUtils.h | 17 ++ 
src/Formats/FormatFactory.cpp | 4 + src/Formats/FormatSettings.h | 4 + src/Formats/JSONUtils.cpp | 115 +++++--- src/Formats/JSONUtils.h | 6 +- src/Formats/ReadSchemaUtils.cpp | 23 +- src/Processors/Formats/ISchemaReader.cpp | 76 +++-- src/Processors/Formats/ISchemaReader.h | 20 +- .../Impl/CustomSeparatedRowInputFormat.cpp | 5 + .../Impl/CustomSeparatedRowInputFormat.h | 2 + .../Impl/JSONColumnsBlockInputFormatBase.cpp | 12 +- .../Impl/JSONColumnsBlockInputFormatBase.h | 2 +- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 13 +- .../Impl/JSONCompactEachRowRowInputFormat.h | 2 + .../Impl/JSONEachRowRowInputFormat.cpp | 18 +- .../Formats/Impl/JSONEachRowRowInputFormat.h | 1 + .../Formats/Impl/MySQLDumpRowInputFormat.cpp | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.h | 1 - .../Formats/Impl/RegexpRowInputFormat.cpp | 7 +- .../Formats/Impl/RegexpRowInputFormat.h | 4 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 3 +- .../Formats/Impl/TSKVRowInputFormat.h | 1 - .../Formats/Impl/TemplateRowInputFormat.cpp | 6 +- .../Formats/Impl/TemplateRowInputFormat.h | 3 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 2 +- .../Formats/Impl/ValuesBlockInputFormat.h | 1 - .../RowInputFormatWithNamesAndTypes.cpp | 4 +- .../Formats/RowInputFormatWithNamesAndTypes.h | 6 +- ...02247_read_bools_as_numbers_json.reference | 12 + .../02247_read_bools_as_numbers_json.sh | 19 ++ .../02268_json_maps_and_objects.reference | 3 + .../02268_json_maps_and_objects.sql | 3 + .../02325_dates_schema_inference.reference | 60 ++++ .../02325_dates_schema_inference.sql | 68 +++++ ...om_json_strings_schema_inference.reference | 17 ++ ...ers_from_json_strings_schema_inference.sql | 19 ++ ..._infer_integers_schema_inference.reference | 36 +++ ...27_try_infer_integers_schema_inference.sql | 43 +++ 42 files changed, 994 insertions(+), 126 deletions(-) create mode 100644 src/DataTypes/transformTypesRecursively.cpp create mode 100644 src/DataTypes/transformTypesRecursively.h create mode 100644 tests/queries/0_stateless/02325_dates_schema_inference.reference create mode 100644 tests/queries/0_stateless/02325_dates_schema_inference.sql create mode 100644 tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference create mode 100644 tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql create mode 100644 tests/queries/0_stateless/02327_try_infer_integers_schema_inference.reference create mode 100644 tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5597d9076a4..b0e7f554717 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -686,6 +686,10 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \ M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. 
The format: 'column1,column2,column3,...'", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ + M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \ + M(Bool, input_format_try_infer_integers, false, "Try to infer numbers from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_dates, false, "Try to infer dates from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_datetimes, false, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \ M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \ diff --git a/src/DataTypes/transformTypesRecursively.cpp b/src/DataTypes/transformTypesRecursively.cpp new file mode 100644 index 00000000000..2f1b689a233 --- /dev/null +++ b/src/DataTypes/transformTypesRecursively.cpp @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +void transformTypesRecursively(DataTypes & types, std::function transform_simple_types, std::function transform_complex_types) +{ + { + /// Arrays + bool have_array = false; + bool all_arrays = true; + DataTypes nested_types; + for (const auto & type : types) + { + if (const DataTypeArray * type_array = typeid_cast(type.get())) + { + have_array = true; + nested_types.push_back(type_array->getNestedType()); + } + else + all_arrays = false; + } + + if (have_array) + { + if (all_arrays) + { + transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared(nested_types[i]); + } + + if (transform_complex_types) + transform_complex_types(types); + + return; + } + } + + { + /// Tuples + bool have_tuple = false; + bool all_tuples = true; + size_t tuple_size = 0; + + std::vector nested_types; + + for (const auto & type : types) + { + if (const DataTypeTuple * type_tuple = typeid_cast(type.get())) + { + if (!have_tuple) + { + tuple_size = type_tuple->getElements().size(); + nested_types.resize(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].reserve(types.size()); + } + else if (tuple_size != type_tuple->getElements().size()) + return; + + have_tuple = true; + + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].emplace_back(type_tuple->getElements()[elem_idx]); + } + else + all_tuples = false; + } + + if (have_tuple) + { + if (all_tuples) + { + std::vector transposed_nested_types(types.size()); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + { + transformTypesRecursively(nested_types[elem_idx], transform_simple_types, transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + transposed_nested_types[i].push_back(nested_types[elem_idx][i]); + } + + for 
(size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared(transposed_nested_types[i]); + + if (transform_complex_types) + transform_complex_types(types); + } + + if (transform_complex_types) + transform_complex_types(types); + + return; + } + } + + { + /// Maps + bool have_maps = false; + bool all_maps = true; + DataTypes key_types; + DataTypes value_types; + key_types.reserve(types.size()); + value_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeMap * type_map = typeid_cast(type.get())) + { + have_maps = true; + key_types.emplace_back(type_map->getKeyType()); + value_types.emplace_back(type_map->getValueType()); + } + else + all_maps = false; + } + + if (have_maps) + { + if (all_maps) + { + transformTypesRecursively(key_types, transform_simple_types, transform_complex_types); + transformTypesRecursively(value_types, transform_simple_types, transform_complex_types); + + for (size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared(key_types[i], value_types[i]); + } + + if (transform_complex_types) + transform_complex_types(types); + + return; + } + } + + { + /// Nullable + bool have_nullable = false; + std::vector is_nullable; + is_nullable.reserve(types.size()); + DataTypes nested_types; + nested_types.reserve(types.size()); + for (const auto & type : types) + { + if (const DataTypeNullable * type_nullable = typeid_cast(type.get())) + { + have_nullable = true; + is_nullable.push_back(1); + nested_types.push_back(type_nullable->getNestedType()); + } + else + { + is_nullable.push_back(0); + nested_types.push_back(type); + } + } + + if (have_nullable) + { + transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + { + if (is_nullable[i]) + types[i] = makeNullable(nested_types[i]); + else + types[i] = nested_types[i]; + } + + return; + } + } + + transform_simple_types(types); +} + +} diff --git a/src/DataTypes/transformTypesRecursively.h b/src/DataTypes/transformTypesRecursively.h new file mode 100644 index 00000000000..5cb8f095494 --- /dev/null +++ b/src/DataTypes/transformTypesRecursively.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Function that applies custom transformation functions to provided types recursively. +/// Implementation is similar to function getLeastSuperType: +/// If all types are Array/Map/Tuple/Nullable, this function will be called to nested types. +/// If not all types are the same complex type (Array/Map/Tuple), this function won't be called to nested types. +/// Function transform_simple_types will be applied to resulting simple types after all recursive calls. +/// Function transform_complex_types will be applied to complex types (Array/Map/Tuple) after recursive call to their nested types. 
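+/// Example: for [Array(Int64), Array(Float64)] the function recurses into the nested types, so
+/// transform_simple_types sees [Int64, Float64] and may promote both to Float64, making the result
+/// [Array(Float64), Array(Float64)].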
+void transformTypesRecursively(DataTypes & types, std::function transform_simple_types, std::function transform_complex_types); + +} diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 5aab8909a0c..0ae7918f682 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -9,8 +9,12 @@ #include #include #include -#include +#include +#include #include +#include +#include +#include #include #include #include @@ -255,7 +259,215 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e return readByEscapingRule(buf, escaping_rule, format_settings); } -static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) +void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, bool is_json, const std::unordered_set * numbers_parsed_from_json_strings = nullptr) +{ + /// Do nothing if we didn't try to infer something special. + if (!settings.try_infer_integers && !settings.try_infer_dates && !settings.try_infer_datetimes && !is_json) + return; + + auto transform_simple_types = [&](DataTypes & data_types) + { + /// If we have floats and integers convert them all to float. + if (settings.try_infer_integers) + { + bool have_floats = false; + bool have_integers = false; + for (const auto & type : data_types) + { + have_floats |= isFloat(type); + have_integers |= isInteger(type) && !isBool(type); + } + + if (have_floats && have_integers) + { + for (auto & type : data_types) + { + if (isInteger(type)) + type = std::make_shared(); + } + } + } + + /// If we have date/datetimes and smth else, convert them to string. + /// If we have only dates and datetimes, convert dates to datetime. + if (settings.try_infer_dates || settings.try_infer_datetimes) + { + bool have_dates = false; + bool have_datetimes = false; + bool all_dates_or_datetimes = true; + + for (const auto & type : data_types) + { + have_dates |= isDate(type); + have_datetimes |= isDateTime64(type); + all_dates_or_datetimes &= isDate(type) || isDateTime64(type); + } + + if (!all_dates_or_datetimes && (have_dates || have_datetimes)) + { + for (auto & type : data_types) + { + if (isDate(type) || isDateTime64(type)) + type = std::make_shared(); + } + } + else if (have_dates && have_datetimes) + { + for (auto & type : data_types) + { + if (isDate(type)) + type = std::make_shared(9); + } + } + } + + if (!is_json) + return; + + /// Check settings specific for JSON formats. + + /// If we have numbers and strings, convert numbers to strings. 
+ /// (Actually numbers could not be parsed from + if (settings.json.try_infer_numbers_from_strings) + { + bool have_strings = false; + bool have_numbers = false; + for (const auto & type : data_types) + { + have_strings |= isString(type); + have_numbers |= isNumber(type); + } + + if (have_strings && have_numbers) + { + for (auto & type : data_types) + { + if (isNumber(type) && (!numbers_parsed_from_json_strings || numbers_parsed_from_json_strings->contains(type.get()))) + type = std::make_shared(); + } + } + } + + if (settings.json.read_bools_as_numbers) + { + bool have_floats = false; + bool have_integers = false; + bool have_bools = false; + for (const auto & type : data_types) + { + have_floats |= isFloat(type); + have_integers |= isInteger(type) && !isBool(type); + have_bools |= isBool(type); + } + + if (have_bools && (have_integers || have_floats)) + { + for (auto & type : data_types) + { + if (isBool(type)) + { + if (have_integers) + type = std::make_shared(); + else + type = std::make_shared(); + } + } + } + } + }; + + auto transform_complex_types = [&](DataTypes & data_types) + { + if (!is_json) + return; + + bool have_maps = false; + bool have_objects = false; + bool are_maps_equal = true; + DataTypePtr first_map_type; + for (const auto & type : data_types) + { + if (isMap(type)) + { + if (!have_maps) + { + first_map_type = type; + have_maps = true; + } + else + { + are_maps_equal &= type->equals(*first_map_type); + if (!type->equals(*first_map_type)) + LOG_DEBUG(&Poco::Logger::get("SchemaInference"), "Maps {} and {} are different", type->getName(), first_map_type->getName()); + } + } + else if (isObject(type)) + { + have_objects = true; + } + } + + if (have_maps && (have_objects || !are_maps_equal)) + { + for (auto & type : data_types) + { + if (isMap(type)) + type = std::make_shared("json", true); + } + } + }; + + transformTypesRecursively(types, transform_simple_types, transform_complex_types); +} + +void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) +{ + transformInferredTypesIfNeededImpl(types, settings, escaping_rule == FormatSettings::EscapingRule::JSON); +} + +void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) +{ + DataTypes types = {first, second}; + transformInferredTypesIfNeeded(types, settings, escaping_rule); + first = std::move(types[0]); + second = std::move(types[1]); +} + +void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set * numbers_parsed_from_json_strings) +{ + transformInferredTypesIfNeededImpl(types, settings, true, numbers_parsed_from_json_strings); +} + +void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings) +{ + DataTypes types = {first, second}; + transformInferredJSONTypesIfNeeded(types, settings); + first = std::move(types[0]); + second = std::move(types[1]); +} + +DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings) +{ + if (settings.try_infer_dates) + { + ReadBufferFromString buf(field); + DayNum tmp; + if (tryReadDateText(tmp, buf) && buf.eof()) + return makeNullable(std::make_shared()); + } + + if (settings.try_infer_datetimes) + { + ReadBufferFromString buf(field); + DateTime64 tmp; + if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof()) + return makeNullable(std::make_shared(9)); + } + + return 
nullptr; +} + +static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBufferFromString & buf, const FormatSettings & settings) { if (buf.eof()) return nullptr; @@ -279,7 +491,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) else first = false; - auto nested_type = determineDataTypeForSingleFieldImpl(buf); + auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!nested_type) return nullptr; @@ -294,6 +506,8 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) if (nested_types.empty()) return std::make_shared(std::make_shared()); + transformInferredTypesIfNeeded(nested_types, settings); + auto least_supertype = tryGetLeastSupertype(nested_types); if (!least_supertype) return nullptr; @@ -320,7 +534,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) else first = false; - auto nested_type = determineDataTypeForSingleFieldImpl(buf); + auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!nested_type) return nullptr; @@ -355,7 +569,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) else first = false; - auto key_type = determineDataTypeForSingleFieldImpl(buf); + auto key_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!key_type) return nullptr; @@ -366,7 +580,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) return nullptr; skipWhitespaceIfAny(buf); - auto value_type = determineDataTypeForSingleFieldImpl(buf); + auto value_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!value_type) return nullptr; @@ -382,6 +596,9 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) if (key_types.empty()) return std::make_shared(std::make_shared(), std::make_shared()); + transformInferredTypesIfNeeded(key_types, settings); + transformInferredTypesIfNeeded(value_types, settings); + auto key_least_supertype = tryGetLeastSupertype(key_types); auto value_least_supertype = tryGetLeastSupertype(value_types); @@ -398,9 +615,11 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) if (*buf.position() == '\'') { ++buf.position(); + String field; while (!buf.eof()) { char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end()); + field.append(buf.position(), next_pos); buf.position() = next_pos; if (!buf.hasPendingData()) @@ -409,6 +628,7 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) if (*buf.position() == '\'') break; + field.push_back(*buf.position()); if (*buf.position() == '\\') ++buf.position(); } @@ -417,6 +637,9 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) return nullptr; ++buf.position(); + if (auto type = tryInferDateOrDateTime(field, settings)) + return type; + return std::make_shared(); } @@ -430,15 +653,29 @@ static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf) /// Number Float64 tmp; + auto * pos_before_float = buf.position(); if (tryReadFloatText(tmp, buf)) + { + if (settings.try_infer_integers) + { + auto * float_end_pos = buf.position(); + buf.position() = pos_before_float; + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.position() == float_end_pos) + return std::make_shared(); + + buf.position() = float_end_pos; + } + return std::make_shared(); + } return nullptr; } -static DataTypePtr determineDataTypeForSingleField(ReadBuffer & buf) +static DataTypePtr determineDataTypeForSingleField(ReadBufferFromString & buf, const FormatSettings & 
settings) { - return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf)); + return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf, settings)); } DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) @@ -448,11 +685,11 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe case FormatSettings::EscapingRule::Quoted: { ReadBufferFromString buf(field); - auto type = determineDataTypeForSingleField(buf); + auto type = determineDataTypeForSingleField(buf, format_settings); return buf.eof() ? type : nullptr; } case FormatSettings::EscapingRule::JSON: - return JSONUtils::getDataTypeFromField(field); + return JSONUtils::getDataTypeFromField(field, format_settings); case FormatSettings::EscapingRule::CSV: { if (!format_settings.csv.input_format_use_best_effort_in_schema_inference) @@ -466,9 +703,13 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { - ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2)); + auto data = std::string_view(field.data() + 1, field.size() - 2); + if (auto date_type = tryInferDateOrDateTime(data, format_settings)) + return date_type; + + ReadBufferFromString buf(data); /// Try to determine the type of value inside quotes - auto type = determineDataTypeForSingleField(buf); + auto type = determineDataTypeForSingleField(buf, format_settings); if (!type) return nullptr; @@ -481,6 +722,14 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe } /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string. + if (format_settings.try_infer_integers) + { + ReadBufferFromString buf(field); + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.eof()) + return makeNullable(std::make_shared()); + } + ReadBufferFromString buf(field); Float64 tmp; if (tryReadFloatText(tmp, buf) && buf.eof()) @@ -500,8 +749,11 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) return DataTypeFactory::instance().get("Nullable(Bool)"); + if (auto date_type = tryInferDateOrDateTime(field, format_settings)) + return date_type; + ReadBufferFromString buf(field); - auto type = determineDataTypeForSingleField(buf); + auto type = determineDataTypeForSingleField(buf, format_settings); if (!buf.eof()) return makeNullable(std::make_shared()); diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 1ce04a8d1b7..ad4ce65a430 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -60,4 +60,21 @@ DataTypes determineDataTypesByEscapingRule(const std::vector & fields, c DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules); +/// Try to infer Date or Datetime from string if corresponding settings are enabled. +DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings); + +/// Check if we need to transform types inferred from data and transform it if necessary. 
+/// It's used when we try to infer some not ordinary types from another types. +/// For example dates from strings, we should check if dates were inferred from all strings +/// in the same way and if not, transform inferred dates back to strings. +/// For example, if we have array of strings and we tried to infer dates from them, +/// to make the result type Array(Date) we should ensure that all strings were +/// successfully parsed as dated and if not, convert all dates back to strings and make result type Array(String). +void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped); +void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped); + +/// Same as transformInferredTypesIfNeeded but takes into account settings that are special for JSON formats. +void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set * numbers_parsed_from_json_strings = nullptr); +void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings); + } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 756b33d3eb2..a2cd921c8f5 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -94,6 +94,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers; + format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; @@ -163,6 +164,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.sql_insert.table_name = settings.output_format_sql_insert_table_name; format_settings.sql_insert.use_replace = settings.output_format_sql_insert_use_replace; format_settings.sql_insert.quote_names = settings.output_format_sql_insert_quote_names; + format_settings.try_infer_integers = settings.input_format_try_infer_integers; + format_settings.try_infer_dates = settings.input_format_try_infer_dates; + format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 70bf8979383..eb619ddfd79 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -38,6 +38,9 @@ struct FormatSettings UInt64 max_rows_to_read_for_schema_inference = 100; String column_names_for_schema_inference; + bool try_infer_integers = false; + bool try_infer_dates = false; + bool try_infer_datetimes = false; enum class DateTimeInputFormat { @@ -142,6 +145,7 @@ struct FormatSettings bool named_tuples_as_objects = false; bool serialize_as_strings = false; bool read_bools_as_numbers = 
true; + bool try_infer_numbers_from_strings = false; } json; struct diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 1ac58760516..63c06a8615d 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -121,7 +122,7 @@ namespace JSONUtils } template - DataTypePtr getDataTypeFromFieldImpl(const Element & field) + DataTypePtr getDataTypeFromFieldImpl(const Element & field, const FormatSettings & settings, std::unordered_set & numbers_parsed_from_json_strings) { if (field.isNull()) return nullptr; @@ -129,11 +130,48 @@ namespace JSONUtils if (field.isBool()) return DataTypeFactory::instance().get("Nullable(Bool)"); - if (field.isInt64() || field.isUInt64() || field.isDouble()) + if (field.isInt64() || field.isUInt64()) + { + if (settings.try_infer_integers) + return makeNullable(std::make_shared()); + + return makeNullable(std::make_shared()); + } + + if (field.isDouble()) return makeNullable(std::make_shared()); if (field.isString()) + { + if (auto date_type = tryInferDateOrDateTime(field.getString(), settings)) + return date_type; + + if (!settings.json.try_infer_numbers_from_strings) + return makeNullable(std::make_shared()); + + ReadBufferFromString buf(field.getString()); + + if (settings.try_infer_integers) + { + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.eof()) + { + auto type = std::make_shared(); + numbers_parsed_from_json_strings.insert(type.get()); + return makeNullable(type); + } + } + + Float64 tmp; + if (tryReadFloatText(tmp, buf) && buf.eof()) + { + auto type = std::make_shared(); + numbers_parsed_from_json_strings.insert(type.get()); + return makeNullable(type); + } + return makeNullable(std::make_shared()); + } if (field.isArray()) { @@ -145,20 +183,32 @@ namespace JSONUtils DataTypes nested_data_types; /// If this array contains fields with different types we will treat it as Tuple. 
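+ /// For example, [1, "hello"] ends up as a Tuple, while [1, 2.5] can still be unified into
+ /// Array(Float64) by transformInferredJSONTypesIfNeeded below.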
- bool is_tuple = false; + bool are_types_the_same = true; for (const auto element : array) { - auto type = getDataTypeFromFieldImpl(element); + auto type = getDataTypeFromFieldImpl(element, settings, numbers_parsed_from_json_strings); if (!type) return nullptr; - if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) - is_tuple = true; + if (!nested_data_types.empty() && !type->equals(*nested_data_types.back())) + are_types_the_same = false; nested_data_types.push_back(std::move(type)); } - if (is_tuple) + if (!are_types_the_same) + { + auto nested_types_copy = nested_data_types; + transformInferredJSONTypesIfNeeded(nested_types_copy, settings, &numbers_parsed_from_json_strings); + are_types_the_same = true; + for (size_t i = 1; i < nested_types_copy.size(); ++i) + are_types_the_same &= nested_types_copy[i]->equals(*nested_types_copy[i - 1]); + + if (are_types_the_same) + nested_data_types = std::move(nested_types_copy); + } + + if (!are_types_the_same) return std::make_shared(nested_data_types); return std::make_shared(nested_data_types.back()); @@ -167,38 +217,35 @@ namespace JSONUtils if (field.isObject()) { auto object = field.getObject(); - DataTypePtr value_type; - bool is_object = false; + DataTypes value_types; + bool have_object_value = false; for (const auto key_value_pair : object) { - auto type = getDataTypeFromFieldImpl(key_value_pair.second); + auto type = getDataTypeFromFieldImpl(key_value_pair.second, settings, numbers_parsed_from_json_strings); if (!type) continue; if (isObject(type)) { - is_object = true; + have_object_value = true; break; } - if (!value_type) - { - value_type = type; - } - else if (!value_type->equals(*type)) - { - is_object = true; - break; - } + value_types.push_back(type); } - if (is_object) + if (value_types.empty()) + return nullptr; + + transformInferredJSONTypesIfNeeded(value_types, settings, &numbers_parsed_from_json_strings); + bool are_types_equal = true; + for (size_t i = 1; i < value_types.size(); ++i) + are_types_equal &= value_types[i]->equals(*value_types[0]); + + if (have_object_value || !are_types_equal) return std::make_shared("json", true); - if (value_type) - return std::make_shared(std::make_shared(), value_type); - - return nullptr; + return std::make_shared(std::make_shared(), value_types[0]); } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; @@ -215,18 +262,19 @@ namespace JSONUtils #endif } - DataTypePtr getDataTypeFromField(const String & field) + DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings) { auto [parser, element] = getJSONParserAndElement(); bool parsed = parser.parse(field, element); if (!parsed) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", field); - return getDataTypeFromFieldImpl(element); + std::unordered_set numbers_parsed_from_json_strings; + return getDataTypeFromFieldImpl(element, settings, numbers_parsed_from_json_strings); } template - static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) + static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, const FormatSettings & settings, bool /*json_strings*/, Extractor & extractor) { String line = readJSONEachRowLineIntoStringImpl(in); auto [parser, element] = getJSONParserAndElement(); @@ -238,8 +286,9 @@ namespace JSONUtils DataTypes data_types; data_types.reserve(fields.size()); + std::unordered_set numbers_parsed_from_json_strings; for 
(const auto & field : fields) - data_types.push_back(getDataTypeFromFieldImpl(field)); + data_types.push_back(getDataTypeFromFieldImpl(field, settings, numbers_parsed_from_json_strings)); /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. /// Should we try to parse data inside strings somehow in this case? @@ -284,11 +333,11 @@ namespace JSONUtils std::vector column_names; }; - NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings) { JSONEachRowFieldsExtractor extractor; auto data_types - = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + = determineColumnDataTypesFromJSONEachRowDataImpl(in, settings, json_strings, extractor); NamesAndTypesList result; for (size_t i = 0; i != extractor.column_names.size(); ++i) result.emplace_back(extractor.column_names[i], data_types[i]); @@ -313,10 +362,10 @@ namespace JSONUtils } }; - DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings) { JSONCompactEachRowFieldsExtractor extractor; - return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + return determineColumnDataTypesFromJSONEachRowDataImpl(in, settings, json_strings, extractor); } diff --git a/src/Formats/JSONUtils.h b/src/Formats/JSONUtils.h index f2aba3cbcb5..b4ab6a29c93 100644 --- a/src/Formats/JSONUtils.h +++ b/src/Formats/JSONUtils.h @@ -22,16 +22,16 @@ namespace JSONUtils /// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable. /// JSON array with different nested types is treated as Tuple. /// If cannot convert (for example when field contains null), return nullptr. - DataTypePtr getDataTypeFromField(const String & field); + DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings); /// Read row in JSONEachRow format and try to determine type for each field. /// Return list of names and types. /// If cannot determine the type of some field, return nullptr for it. - NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings); /// Read row in JSONCompactEachRow format and try to determine type for each field. /// If cannot determine the type of some field, return nullptr for it. - DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings); bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 11a91bd50dc..b3934f422f8 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -75,8 +75,25 @@ ColumnsDescription readSchemaFromFormat( SchemaReaderPtr schema_reader; size_t max_rows_to_read = format_settings ? 
format_settings->max_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; size_t iterations = 0; - while ((buf = read_buffer_iterator())) + + while (true) { + try + { + buf = read_buffer_iterator(); + if (!buf) + break; + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file:\n{}\nYou can specify the structure manually", + format_name, + exception_message); + } + ++iterations; if (buf->eof()) @@ -118,14 +135,14 @@ ColumnsDescription readSchemaFromFormat( } if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, exception_message); + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}\nYou can specify the structure manually", format_name, exception_message); exception_messages += "\n" + exception_message; } } if (names_and_types.empty()) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. Errors:{}", exception_messages); + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. Errors:{}\nYou can specify the structure manually", exception_messages); /// If we have "INSERT SELECT" query then try to order /// columns as they are ordered in table schema for formats diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 5a6ebf00660..3df9ea70e34 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -17,35 +18,38 @@ namespace ErrorCodes void chooseResultColumnType( DataTypePtr & type, - const DataTypePtr & new_type, - CommonDataTypeChecker common_type_checker, + DataTypePtr & new_type, + std::function transform_types_if_needed, const DataTypePtr & default_type, const String & column_name, size_t row) { if (!type) + { type = new_type; + return; + } + + if (!new_type || type->equals(*new_type)) + return; + + transform_types_if_needed(type, new_type); + if (type->equals(*new_type)) + return; /// If the new type and the previous type for this column are different, /// we will use default type if we have it or throw an exception. 
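+ /// (transform_types_if_needed above has already tried to reconcile them, e.g. Int64 with Float64
+ /// becomes Float64, so this point is only reached for genuinely incompatible types.)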
- if (new_type && !type->equals(*new_type)) + if (default_type) + type = default_type; + else { - DataTypePtr common_type; - if (common_type_checker) - common_type = common_type_checker(type, new_type); - - if (common_type) - type = common_type; - else if (default_type) - type = default_type; - else - throw Exception( - ErrorCodes::TYPE_MISMATCH, - "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", - type->getName(), - column_name, - row, - new_type->getName()); + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", + type->getName(), + column_name, + row, + new_type->getName()); } } @@ -63,8 +67,8 @@ void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & ty result.emplace_back(name, type); } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings) - : ISchemaReader(in_) +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : ISchemaReader(in_), format_settings(format_settings_) { if (!format_settings.column_names_for_schema_inference.empty()) { @@ -79,14 +83,14 @@ IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & form } } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_) - : IRowSchemaReader(in_, format_settings) +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_) + : IRowSchemaReader(in_, format_settings_) { default_type = default_type_; } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_) - : IRowSchemaReader(in_, format_settings) +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, const DataTypes & default_types_) + : IRowSchemaReader(in_, format_settings_) { default_types = default_types_; } @@ -116,7 +120,8 @@ NamesAndTypesList IRowSchemaReader::readSchema() if (!new_data_types[i]) continue; - chooseResultColumnType(data_types[i], new_data_types[i], common_type_checker, getDefaultType(i), std::to_string(i + 1), rows_read); + auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type, i); }; + chooseResultColumnType(data_types[i], new_data_types[i], transform_types_if_needed, getDefaultType(i), std::to_string(i + 1), rows_read); } } @@ -156,8 +161,13 @@ DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const return nullptr; } -IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, DataTypePtr default_type_) - : ISchemaReader(in_), default_type(default_type_) +void IRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) +{ + transformInferredTypesIfNeeded(type, new_type, format_settings); +} + +IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_) + : ISchemaReader(in_), format_settings(format_settings_), default_type(default_type_) { } @@ -181,6 +191,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() names_order.push_back(name); } + auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type); }; for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read) { auto 
new_names_and_types = readRowAndGetNamesAndDataTypes(eof); @@ -188,7 +199,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() /// We reached eof. break; - for (const auto & [name, new_type] : new_names_and_types) + for (auto & [name, new_type] : new_names_and_types) { auto it = names_to_types.find(name); /// If we didn't see this column before, just add it. @@ -200,7 +211,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() } auto & type = it->second; - chooseResultColumnType(type, new_type, common_type_checker, default_type, name, rows_read); + chooseResultColumnType(type, new_type, transform_types_if_needed, default_type, name, rows_read); } } @@ -219,4 +230,9 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() return result; } +void IRowWithNamesSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) +{ + transformInferredTypesIfNeeded(type, new_type, format_settings); +} + } diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 00987540d04..02c42495b2a 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -53,8 +53,6 @@ public: NamesAndTypesList readSchema() override; - void setCommonTypeChecker(CommonDataTypeChecker checker) { common_type_checker = checker; } - protected: /// Read one row and determine types of columns in it. /// Return types in the same order in which the values were in the row. @@ -67,6 +65,10 @@ protected: void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; } size_t getNumRowsRead() const override { return rows_read; } + FormatSettings format_settings; + + virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx); + private: DataTypePtr getDefaultType(size_t column) const; @@ -74,7 +76,6 @@ private: size_t rows_read = 0; DataTypePtr default_type; DataTypes default_types; - CommonDataTypeChecker common_type_checker; std::vector column_names; }; @@ -86,12 +87,10 @@ private: class IRowWithNamesSchemaReader : public ISchemaReader { public: - IRowWithNamesSchemaReader(ReadBuffer & in_, DataTypePtr default_type_ = nullptr); + IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr); NamesAndTypesList readSchema() override; bool hasStrictOrderOfColumns() const override { return false; } - void setCommonTypeChecker(CommonDataTypeChecker checker) { common_type_checker = checker; } - protected: /// Read one row and determine types of columns in it. /// Return list with names and types. 
@@ -102,11 +101,14 @@ protected: void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; } size_t getNumRowsRead() const override { return rows_read; } + virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type); + + FormatSettings format_settings; + private: size_t max_rows_to_read; size_t rows_read = 0; DataTypePtr default_type; - CommonDataTypeChecker common_type_checker; }; /// Base class for schema inference for formats that don't need any data to @@ -122,8 +124,8 @@ public: void chooseResultColumnType( DataTypePtr & type, - const DataTypePtr & new_type, - CommonDataTypeChecker common_type_checker, + DataTypePtr & new_type, + std::function transform_types_if_needed, const DataTypePtr & default_type, const String & column_name, size_t row); diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 61488a94ccd..e5397ca0757 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -318,6 +318,11 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); } +void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) +{ + transformInferredTypesIfNeeded(type, new_type, format_settings, reader.getEscapingRule()); +} + void registerInputFormatCustomSeparated(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index d9e62a1b8e9..c7e332b983f 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -97,6 +97,8 @@ public: private: DataTypes readRowAndGetDataTypes() override; + void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override; + PeekableReadBuffer buf; CustomSeparatedFormatReader reader; bool first_row = true; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index cdde87f2cf6..7f9fbddd554 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -181,13 +182,14 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( { } -void JSONColumnsSchemaReaderBase::chooseResulType(DataTypePtr & type, const DataTypePtr & new_type, const String & column_name, size_t row) const +void JSONColumnsSchemaReaderBase::chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const { - auto common_type_checker = [&](const DataTypePtr & first, const DataTypePtr & second) + auto convert_types_if_needed = [&](DataTypePtr & first, DataTypePtr & second) { - return JSONUtils::getCommonTypeForJSONFormats(first, second, format_settings.json.read_bools_as_numbers); + DataTypes types = {first, second}; + transformInferredJSONTypesIfNeeded(types, format_settings); }; - chooseResultColumnType(type, new_type, common_type_checker, nullptr, column_name, row); + chooseResultColumnType(type, new_type, convert_types_if_needed, nullptr, column_name, row); } NamesAndTypesList 
JSONColumnsSchemaReaderBase::readSchema() @@ -260,7 +262,7 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & } readJSONField(field, in); - DataTypePtr field_type = JSONUtils::getDataTypeFromField(field); + DataTypePtr field_type = JSONUtils::getDataTypeFromField(field, format_settings); chooseResulType(column_type, field_type, column_name, rows_read); ++rows_read; } diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h index ac746a2e2d1..6769e60be22 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -83,7 +83,7 @@ private: DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read); /// Choose result type for column from two inferred types from different rows. - void chooseResulType(DataTypePtr & type, const DataTypePtr & new_type, const String & column_name, size_t row) const; + void chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const; const FormatSettings format_settings; std::unique_ptr reader; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 1bc5223a712..8ea379beae5 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -187,11 +188,6 @@ JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( : FormatWithNamesAndTypesSchemaReader(in_, format_settings_, with_names_, with_types_, &reader) , reader(in_, yield_strings_, format_settings_) { - bool allow_bools_as_numbers = format_settings_.json.read_bools_as_numbers; - setCommonTypeChecker([allow_bools_as_numbers](const DataTypePtr & first, const DataTypePtr & second) - { - return JSONUtils::getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); - }); } DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() @@ -210,7 +206,12 @@ DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() if (in.eof()) return {}; - return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); + return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, format_settings, reader.yieldStrings()); +} + +void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) +{ + transformInferredJSONTypesIfNeeded(type, new_type, format_settings); } void registerInputFormatJSONCompactEachRow(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index 79c76214774..7be9ba9289b 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -80,6 +80,8 @@ public: private: DataTypes readRowAndGetDataTypes() override; + void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override; + JSONCompactEachRowFormatReader reader; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 9eef72f95da..12415f897cb 100644 --- 
a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -306,18 +307,12 @@ void JSONEachRowRowInputFormat::readSuffix() assertEOF(*in); } -JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) - : IRowWithNamesSchemaReader(in_) +JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings_) + : IRowWithNamesSchemaReader(in_, format_settings_) , json_strings(json_strings_) { - bool allow_bools_as_numbers = format_settings.json.read_bools_as_numbers; - setCommonTypeChecker([allow_bools_as_numbers](const DataTypePtr & first, const DataTypePtr & second) - { - return JSONUtils::getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); - }); } - NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof) { if (first_row) @@ -350,7 +345,12 @@ NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & return {}; } - return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); + return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, json_strings); +} + +void JSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) +{ + transformInferredJSONTypesIfNeeded(type, new_type, format_settings); } void registerInputFormatJSONEachRow(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 1da14a532de..325bee2fcbb 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -92,6 +92,7 @@ public: private: NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override; + void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; bool json_strings; bool first_row = true; diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 8e787edf8ab..8e1beb8ec89 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -402,7 +402,7 @@ void MySQLDumpRowInputFormat::skipField() } MySQLDumpSchemaReader::MySQLDumpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : IRowSchemaReader(in_, format_settings_), format_settings(format_settings_), table_name(format_settings_.mysql_dump.table_name) + : IRowSchemaReader(in_, format_settings_), table_name(format_settings_.mysql_dump.table_name) { } diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h index 2457f3d4762..6be20550e49 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h @@ -35,7 +35,6 @@ private: NamesAndTypesList readSchema() override; DataTypes readRowAndGetDataTypes() override; - const FormatSettings format_settings; String table_name; }; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index d92f65f33d1..c6150863bd4 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -133,7 +133,6 @@ 
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & buf, format_settings_, getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) - , format_settings(format_settings_) , field_extractor(format_settings) , buf(in_) { @@ -157,6 +156,12 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes() return data_types; } +void RegexpSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) +{ + transformInferredTypesIfNeeded(type, new_type, format_settings, format_settings.regexp.escaping_rule); +} + + void registerInputFormatRegexp(FormatFactory & factory) { factory.registerInputFormat("Regexp", []( diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index 3cc6a3192fd..7fbb3fc320f 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -81,8 +81,10 @@ public: private: DataTypes readRowAndGetDataTypes() override; + void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override; + + using EscapingRule = FormatSettings::EscapingRule; - const FormatSettings format_settings; RegexpFieldExtractor field_extractor; PeekableReadBuffer buf; }; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index fe2c0c5ecdd..7393a1d6ce6 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -214,8 +214,7 @@ void TSKVRowInputFormat::resetParser() } TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : IRowWithNamesSchemaReader(in_, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) - , format_settings(format_settings_) + : IRowWithNamesSchemaReader(in_, format_settings_, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) { } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index bf8580bc6b7..5130ee5e827 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -61,7 +61,6 @@ public: private: NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override; - const FormatSettings format_settings; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index df4d49b172c..6e8bba89d8c 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -458,7 +458,6 @@ TemplateSchemaReader::TemplateSchemaReader( , buf(in_) , format(format_) , row_format(row_format_) - , format_settings(format_settings_) , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) { setColumnNames(row_format.column_names); @@ -494,6 +493,11 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes() return data_types; } +void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx) +{ + transformInferredTypesIfNeeded(type, new_type, format_settings, row_format.escaping_rules[column_idx]); +} + static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings) { ParsedTemplateFormatString resultset_format; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h 
b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index ab7043f057e..740683ad95d 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -121,10 +121,11 @@ public: DataTypes readRowAndGetDataTypes() override; private: + void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx) override; + PeekableReadBuffer buf; const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; - FormatSettings format_settings; TemplateFormatReader format_reader; bool first_row = true; }; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 41f77f8bbf2..49b758b78c4 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -567,7 +567,7 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) } ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : IRowSchemaReader(buf, format_settings_), buf(in_), format_settings(format_settings_) + : IRowSchemaReader(buf, format_settings_), buf(in_) { } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 9653e431b4e..bf243c54bd7 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -103,7 +103,6 @@ private: DataTypes readRowAndGetDataTypes() override; PeekableReadBuffer buf; - const FormatSettings format_settings; ParserExpression parser; bool first_row = true; bool end_of_data = false; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index a3dcbe914bb..9ff227e5dab 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -229,12 +229,12 @@ void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( ReadBuffer & in_, - const FormatSettings & format_settings, + const FormatSettings & format_settings_, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, DataTypePtr default_type_) - : IRowSchemaReader(in_, format_settings, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) + : IRowSchemaReader(in_, format_settings_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) { } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 9fc8b2083df..e6a587b446c 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -123,7 +123,7 @@ class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader public: FormatWithNamesAndTypesSchemaReader( ReadBuffer & in, - const FormatSettings & format_settings, + const FormatSettings & format_settings_, bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, @@ -141,5 +141,9 @@ private: FormatWithNamesAndTypesReader * format_reader; }; +/// [2, 2, 4, 0] -> [2, 4, 4, 0] -> [4, 4, 0] -> [4, 4, 0, 0] +/// [2, 4, 4, 2] -> [2, 8, 2, 0] +/// [2, 2, 4, 4] -> [2, 4, 4, 4] -> [4, 4, 4, 0], -> [4, 4, 8, 0] -> [4, 8, 0, 0] + } diff 
--git a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference index a7609bdd86b..b6d10581b16 100644 --- a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference +++ b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference @@ -7,6 +7,12 @@ x Nullable(Float64) x Nullable(Float64) 1 0.42 +x Array(Nullable(Float64)) +[1,0] +[0.42] +x Array(Array(Nullable(Float64))) +[[1,2,3],[1,0],[1,1,0]] +[[1,2,3]] c1 Nullable(Bool) true false @@ -16,3 +22,9 @@ c1 Nullable(Float64) c1 Nullable(Float64) 1 0.42 +c1 Array(Nullable(Float64)) +[1,0] +[0.42] +c1 Array(Array(Nullable(Float64))) +[[1,2,3],[1,0],[1,1,0]] +[[1,2,3]] diff --git a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh index 10f050ea6d1..1b689aaf577 100755 --- a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh +++ b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh @@ -27,6 +27,16 @@ echo -e '{"x" : true} $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" +echo -e '{"x" : [true, false]} +{"x" : [0.42]}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"x" : [[1, 2, 3], [true, false], [1, true, false]]} +{"x" : [[1, 2, 3]]}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + echo -e '[true] [false]' > $DATA_FILE @@ -43,5 +53,14 @@ echo -e '[true] $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" +echo -e '[[true, false]] +[[0.42]]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e '[[[1, 2, 3], [true, false], [1, true, false]]] +[[[1, 2, 3]]]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" rm $DATA_FILE diff --git a/tests/queries/0_stateless/02268_json_maps_and_objects.reference b/tests/queries/0_stateless/02268_json_maps_and_objects.reference index cfdc6e7e55c..73a8a8f43cf 100644 --- a/tests/queries/0_stateless/02268_json_maps_and_objects.reference +++ b/tests/queries/0_stateless/02268_json_maps_and_objects.reference @@ -1,2 +1,5 @@ x Object(Nullable(\'json\')) x Object(Nullable(\'json\')) +x Array(Object(Nullable(\'json\'))) +x Array(Object(Nullable(\'json\'))) +x Tuple(Map(String, Nullable(String)), Map(String, Array(Nullable(Float64))), Array(Nullable(Float64))) diff --git a/tests/queries/0_stateless/02268_json_maps_and_objects.sql b/tests/queries/0_stateless/02268_json_maps_and_objects.sql index 83d8fbaac2d..8a9ede6876c 100644 --- a/tests/queries/0_stateless/02268_json_maps_and_objects.sql +++ b/tests/queries/0_stateless/02268_json_maps_and_objects.sql @@ -1,3 +1,6 @@ -- Tags: no-fasttest desc format(JSONEachRow, '{"x" : {"a" : "Some string"}}, {"x" : {"b" : [1, 2, 3]}}, {"x" : {"c" : {"d" : 10}}}'); desc format(JSONEachRow, '{"x" : {"a" : "Some string"}}, {"x" : {"b" : [1, 2, 3], "c" : {"42" : 42}}}'); +desc format(JSONEachRow, '{"x" : [{"a" : "Some string"}]}, {"x" : [{"b" : [1, 2, 3]}]}'); +desc 
format(JSONEachRow, '{"x" : [{"a" : "Some string"}, {"b" : [1, 2, 3]}]}'); +desc format(JSONEachRow, '{"x" : [{"a" : "Some string"}, {"b" : [1, 2, 3]}, [1, 2, 3]]}'); diff --git a/tests/queries/0_stateless/02325_dates_schema_inference.reference b/tests/queries/0_stateless/02325_dates_schema_inference.reference new file mode 100644 index 00000000000..3ac4ad88f1c --- /dev/null +++ b/tests/queries/0_stateless/02325_dates_schema_inference.reference @@ -0,0 +1,60 @@ +JSONEachRow +x Nullable(Date) +x Nullable(DateTime64(9)) +x Nullable(DateTime64(9)) +x Array(Nullable(Date)) +x Array(Nullable(DateTime64(9))) +x Array(Nullable(DateTime64(9))) +x Map(String, Nullable(DateTime64(9))) +x Array(Nullable(DateTime64(9))) +x Array(Nullable(DateTime64(9))) +x Nullable(DateTime64(9)) +x Array(Nullable(String)) +x Nullable(String) +x Array(Nullable(String)) +x Map(String, Array(Array(Nullable(String)))) +CSV +c1 Nullable(Date) +c1 Nullable(DateTime64(9)) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(Date)) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Map(String, Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(String)) +c1 Nullable(String) +c1 Array(Nullable(String)) +c1 Map(String, Array(Array(Nullable(String)))) +TSV +c1 Nullable(Date) +c1 Nullable(DateTime64(9)) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(Date)) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Map(String, Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(String)) +c1 Nullable(String) +c1 Array(Nullable(String)) +c1 Map(String, Array(Array(Nullable(String)))) +Values +c1 Nullable(Date) +c1 Nullable(DateTime64(9)) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(Date)) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Map(String, Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Array(Nullable(DateTime64(9))) +c1 Nullable(DateTime64(9)) +c1 Array(Nullable(String)) +c1 Nullable(String) +c1 Array(Nullable(String)) +c1 Map(String, Array(Array(Nullable(String)))) diff --git a/tests/queries/0_stateless/02325_dates_schema_inference.sql b/tests/queries/0_stateless/02325_dates_schema_inference.sql new file mode 100644 index 00000000000..4527d4d32f5 --- /dev/null +++ b/tests/queries/0_stateless/02325_dates_schema_inference.sql @@ -0,0 +1,68 @@ +set input_format_try_infer_dates=1; +set input_format_try_infer_datetimes=1; + +select 'JSONEachRow'; +desc format(JSONEachRow, '{"x" : "2020-01-01"}'); +desc format(JSONEachRow, '{"x" : "2020-01-01 00:00:00.00000"}'); +desc format(JSONEachRow, '{"x" : "2020-01-01 00:00:00"}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01", "2020-01-02"]}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01", "2020-01-01 00:00:00"]}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01 00:00:00", "2020-01-01 00:00:00"]}'); +desc format(JSONEachRow, '{"x" : {"date1" : "2020-01-01 00:00:00", "date2" : "2020-01-01"}}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01 00:00:00", "2020-01-01"]}\n{"x" : ["2020-01-01"]}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01 00:00:00"]}\n{"x" : ["2020-01-01"]}'); +desc format(JSONEachRow, '{"x" : "2020-01-01 00:00:00"}\n{"x" : "2020-01-01"}'); +desc format(JSONEachRow, '{"x" : ["2020-01-01 00:00:00", "Some string"]}'); +desc format(JSONEachRow, '{"x" : "2020-01-01 00:00:00"}\n{"x" : "Some string"}'); +desc 
format(JSONEachRow, '{"x" : ["2020-01-01 00:00:00", "2020-01-01"]}\n{"x" : ["2020-01-01", "Some string"]}'); +desc format(JSONEachRow, '{"x" : {"key1" : [["2020-01-01 00:00:00"]], "key2" : [["2020-01-01"]]}}\n{"x" : {"key1" : [["2020-01-01"]], "key2" : [["Some string"]]}}'); + +select 'CSV'; +desc format(CSV, '"2020-01-01"'); +desc format(CSV, '"2020-01-01 00:00:00.00000"'); +desc format(CSV, '"2020-01-01 00:00:00"'); +desc format(CSV, '"[\'2020-01-01\', \'2020-01-02\']"'); +desc format(CSV, '"[\'2020-01-01\', \'2020-01-01 00:00:00\']"'); +desc format(CSV, '"[\'2020-01-01 00:00:00\', \'2020-01-01 00:00:00\']"'); +desc format(CSV, '"{\'date1\' : \'2020-01-01 00:00:00\', \'date2\' : \'2020-01-01\'}"'); +desc format(CSV, '"[\'2020-01-01 00:00:00\', \'2020-01-01\']"\n"[\'2020-01-01\']"'); +desc format(CSV, '"[\'2020-01-01 00:00:00\']"\n"[\'2020-01-01\']"'); +desc format(CSV, '"2020-01-01 00:00:00"\n"2020-01-01"'); +desc format(CSV, '"[\'2020-01-01 00:00:00\', \'Some string\']"'); +desc format(CSV, '"2020-01-01 00:00:00"\n"Some string"'); +desc format(CSV, '"[\'2020-01-01 00:00:00\', \'2020-01-01\']"\n"[\'2020-01-01\', \'Some string\']"'); +desc format(CSV, '"{\'key1\' : [[\'2020-01-01 00:00:00\']], \'key2\' : [[\'2020-01-01\']]}"\n"{\'key1\' : [[\'2020-01-01\']], \'key2\' : [[\'Some string\']]}"'); + +select 'TSV'; +desc format(TSV, '2020-01-01'); +desc format(TSV, '2020-01-01 00:00:00.00000'); +desc format(TSV, '2020-01-01 00:00:00'); +desc format(TSV, '[\'2020-01-01\', \'2020-01-02\']'); +desc format(TSV, '[\'2020-01-01\', \'2020-01-01 00:00:00\']'); +desc format(TSV, '[\'2020-01-01 00:00:00\', \'2020-01-01 00:00:00\']'); +desc format(TSV, '{\'date1\' : \'2020-01-01 00:00:00\', \'date2\' : \'2020-01-01\'}'); +desc format(TSV, '[\'2020-01-01 00:00:00\', \'2020-01-01\']\n[\'2020-01-01\']'); +desc format(TSV, '[\'2020-01-01 00:00:00\']\n[\'2020-01-01\']'); +desc format(TSV, '2020-01-01 00:00:00\n2020-01-01'); +desc format(TSV, '[\'2020-01-01 00:00:00\', \'Some string\']'); +desc format(TSV, '2020-01-01 00:00:00\nSome string'); +desc format(TSV, '[\'2020-01-01 00:00:00\', \'2020-01-01\']\n[\'2020-01-01\', \'Some string\']'); +desc format(TSV, '{\'key1\' : [[\'2020-01-01 00:00:00\']], \'key2\' : [[\'2020-01-01\']]}\n{\'key1\' : [[\'2020-01-01\']], \'key2\' : [[\'Some string\']]}'); + +select 'Values'; +desc format(Values, '(\'2020-01-01\')'); +desc format(Values, '(\'2020-01-01 00:00:00.00000\')'); +desc format(Values, '(\'2020-01-01 00:00:00\')'); +desc format(Values, '([\'2020-01-01\', \'2020-01-02\'])'); +desc format(Values, '([\'2020-01-01\', \'2020-01-01 00:00:00\'])'); +desc format(Values, '([\'2020-01-01 00:00:00\', \'2020-01-01 00:00:00\'])'); +desc format(Values, '({\'date1\' : \'2020-01-01 00:00:00\', \'date2\' : \'2020-01-01\'})'); +desc format(Values, '([\'2020-01-01 00:00:00\', \'2020-01-01\'])\n([\'2020-01-01\'])'); +desc format(Values, '([\'2020-01-01 00:00:00\']), ([\'2020-01-01\'])'); +desc format(Values, '(\'2020-01-01 00:00:00\')\n(\'2020-01-01\')'); +desc format(Values, '([\'2020-01-01 00:00:00\', \'Some string\'])'); +desc format(Values, '(\'2020-01-01 00:00:00\')\n(\'Some string\')'); +desc format(Values, '([\'2020-01-01 00:00:00\', \'2020-01-01\'])\n([\'2020-01-01\', \'Some string\'])'); +desc format(Values, '({\'key1\' : [[\'2020-01-01 00:00:00\']], \'key2\' : [[\'2020-01-01\']]})\n({\'key1\' : [[\'2020-01-01\']], \'key2\' : [[\'Some string\']]})'); + + diff --git a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference 
b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference new file mode 100644 index 00000000000..2972dd92756 --- /dev/null +++ b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference @@ -0,0 +1,17 @@ +x Nullable(Float64) +x Array(Nullable(Float64)) +x Map(String, Nullable(Float64)) +x Map(String, Array(Nullable(Float64))) +x Nullable(Float64) +x Array(Nullable(Float64)) +x Map(String, Nullable(Float64)) +x Map(String, Array(Nullable(Float64))) +x Array(Nullable(String)) +x Map(String, Nullable(String)) +x Map(String, Array(Nullable(String))) +x Nullable(String) +x Array(Nullable(String)) +x Map(String, Nullable(String)) +x Map(String, Array(Nullable(String))) +x Tuple(Nullable(Float64), Nullable(String)) +x Object(Nullable(\'json\')) diff --git a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql new file mode 100644 index 00000000000..d94e9b2dc23 --- /dev/null +++ b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql @@ -0,0 +1,19 @@ +set input_format_json_try_infer_numbers_from_strings=1; + +desc format(JSONEachRow, '{"x" : "123"}'); +desc format(JSONEachRow, '{"x" : ["123", 123, 12.3]}'); +desc format(JSONEachRow, '{"x" : {"k1" : "123", "k2" : 123}}'); +desc format(JSONEachRow, '{"x" : {"k1" : ["123", "123"], "k2" : [123, 123]}}'); +desc format(JSONEachRow, '{"x" : "123"}\n{"x" : 123}'); +desc format(JSONEachRow, '{"x" : ["123", "456"]}\n{"x" : [123, 456]}'); +desc format(JSONEachRow, '{"x" : {"k1" : "123"}}\n{"x" : {"k2" : 123}}'); +desc format(JSONEachRow, '{"x" : {"k1" : ["123", "123"]}}\n{"x": {"k2" : [123, 123]}}'); +desc format(JSONEachRow, '{"x" : ["123", "Some string"]}'); +desc format(JSONEachRow, '{"x" : {"k1" : "123", "k2" : "Some string"}}'); +desc format(JSONEachRow, '{"x" : {"k1" : ["123", "123"], "k2" : ["Some string"]}}'); +desc format(JSONEachRow, '{"x" : "123"}\n{"x" : "Some string"}'); +desc format(JSONEachRow, '{"x" : ["123", "456"]}\n{"x" : ["Some string"]}'); +desc format(JSONEachRow, '{"x" : {"k1" : "123"}}\n{"x" : {"k2" : "Some string"}}'); +desc format(JSONEachRow, '{"x" : {"k1" : ["123", "123"]}}\n{"x": {"k2" : ["Some string"]}}'); +desc format(JSONEachRow, '{"x" : [123, "Some string"]}'); +desc format(JSONEachRow, '{"x" : {"a" : 123, "b" : "Some string"}}'); diff --git a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.reference b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.reference new file mode 100644 index 00000000000..a1cb9f8e5dc --- /dev/null +++ b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.reference @@ -0,0 +1,36 @@ +JSONEachRow +x Nullable(Int64) +x Array(Nullable(Int64)) +x Map(String, Array(Nullable(Int64))) +x Map(String, Array(Nullable(Int64))) +x Nullable(Float64) +x Nullable(Float64) +x Array(Nullable(Float64)) +x Map(String, Array(Nullable(Float64))) +CSV +c1 Nullable(Int64) +c1 Array(Nullable(Int64)) +c1 Map(String, Array(Nullable(Int64))) +c1 Map(String, Array(Nullable(Int64))) +c1 Nullable(Float64) +c1 Nullable(Float64) +c1 Array(Nullable(Float64)) +c1 Map(String, Array(Nullable(Float64))) +TSV +c1 Nullable(Int64) +c1 Array(Nullable(Int64)) +c1 Map(String, Array(Nullable(Int64))) +c1 Map(String, Array(Nullable(Int64))) +c1 Nullable(Float64) +c1 Nullable(Float64) +c1 Array(Nullable(Float64)) +c1 Map(String, Array(Nullable(Float64))) +Values +c1 Nullable(Int64) +c1 
Array(Nullable(Int64)) +c1 Map(String, Array(Nullable(Int64))) +c1 Map(String, Array(Nullable(Int64))) +c1 Nullable(Float64) +c1 Nullable(Float64) +c1 Array(Nullable(Float64)) +c1 Map(String, Array(Nullable(Float64))) diff --git a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql new file mode 100644 index 00000000000..6dc94a643a2 --- /dev/null +++ b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql @@ -0,0 +1,43 @@ +set input_format_try_infer_integers=1; + +select 'JSONEachRow'; +desc format(JSONEachRow, '{"x" : 123}'); +desc format(JSONEachRow, '{"x" : [123, 123]}'); +desc format(JSONEachRow, '{"x" : {"a" : [123, 123]}}'); +desc format(JSONEachRow, '{"x" : {"a" : [123, 123]}}\n{"x" : {"b" : [321, 321]}}'); +desc format(JSONEachRow, '{"x" : 123}\n{"x" : 123.123}'); +desc format(JSONEachRow, '{"x" : 123}\n{"x" : 1e2}'); +desc format(JSONEachRow, '{"x" : [123, 123]}\n{"x" : [321.321, 312]}'); +desc format(JSONEachRow, '{"x" : {"a" : [123, 123]}}\n{"x" : {"b" : [321.321, 123]}}'); + +select 'CSV'; +desc format(CSV, '123'); +desc format(CSV, '"[123, 123]"'); +desc format(CSV, '"{\'a\' : [123, 123]}"'); +desc format(CSV, '"{\'a\' : [123, 123]}"\n"{\'b\' : [321, 321]}"'); +desc format(CSV, '123\n123.123'); +desc format(CSV, '122\n1e2'); +desc format(CSV, '"[123, 123]"\n"[321.321, 312]"'); +desc format(CSV, '"{\'a\' : [123, 123]}"\n"{\'b\' : [321.321, 123]}"'); + +select 'TSV'; +desc format(TSV, '123'); +desc format(TSV, '[123, 123]'); +desc format(TSV, '{\'a\' : [123, 123]}'); +desc format(TSV, '{\'a\' : [123, 123]}\n{\'b\' : [321, 321]}'); +desc format(TSV, '123\n123.123'); +desc format(TSV, '122\n1e2'); +desc format(TSV, '[123, 123]\n[321.321, 312]'); +desc format(TSV, '{\'a\' : [123, 123]}\n{\'b\' : [321.321, 123]}'); + +select 'Values'; +desc format(Values, '(123)'); +desc format(Values, '([123, 123])'); +desc format(Values, '({\'a\' : [123, 123]})'); +desc format(Values, '({\'a\' : [123, 123]}), ({\'b\' : [321, 321]})'); +desc format(Values, '(123), (123.123)'); +desc format(Values, '(122), (1e2)'); +desc format(Values, '([123, 123])\n([321.321, 312])'); +desc format(Values, '({\'a\' : [123, 123]}), ({\'b\' : [321.321, 123]})'); + + From 2b7c6b7ecd1574cf87eff358daedc2025ada45c0 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 13 Jul 2022 15:59:04 +0000 Subject: [PATCH 004/164] Remove logging --- src/Formats/EscapingRuleUtils.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 0ae7918f682..3aceaeeff1b 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -397,8 +397,6 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings else { are_maps_equal &= type->equals(*first_map_type); - if (!type->equals(*first_map_type)) - LOG_DEBUG(&Poco::Logger::get("SchemaInference"), "Maps {} and {} are different", type->getName(), first_map_type->getName()); } } else if (isObject(type)) From 266039ea646d1519e81c5fae786f4022dc168b59 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 14 Jul 2022 19:00:17 +0000 Subject: [PATCH 005/164] Correct gTests for DateLUT --- src/Common/DateLUTImpl.h | 2 +- src/Common/tests/gtest_DateLUTImpl.cpp | 9 +++++---- src/Functions/FunctionsConversion.h | 6 ++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 209afc9e6f0..c8a0e149515 100644 --- 
a/src/Common/DateLUTImpl.h
+++ b/src/Common/DateLUTImpl.h
@@ -80,7 +80,7 @@ private:
     static inline LUTIndex normalizeLUTIndex(Int64 index)
     {
-        if (index < 0 )
+        if unlikely(index < 0 )
             return LUTIndex(0);
         if (index >= DATE_LUT_SIZE)
             LUTIndex(DATE_LUT_SIZE - 1);
diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp
index d522448d337..2584d48a8d6 100644
--- a/src/Common/tests/gtest_DateLUTImpl.cpp
+++ b/src/Common/tests/gtest_DateLUTImpl.cpp
@@ -79,12 +79,13 @@ FailuresCount countFailures(const ::testing::TestResult & test_result)
 TEST(DateLUTTest, makeDayNumTest)
 {
     const DateLUTImpl & lut = DateLUT::instance("UTC");
-    EXPECT_EQ(0, lut.makeDayNum(1924, 12, 31));
-    EXPECT_EQ(-1, lut.makeDayNum(1924, 12, 31, -1));
+    EXPECT_EQ(0, lut.makeDayNum(1899, 12, 31));
+    EXPECT_EQ(-1, lut.makeDayNum(1899, 12, 31, -1));
+    EXPECT_EQ(-25567, lut.makeDayNum(1900, 1, 1));
     EXPECT_EQ(-16436, lut.makeDayNum(1925, 1, 1));
     EXPECT_EQ(0, lut.makeDayNum(1970, 1, 1));
-    EXPECT_EQ(114635, lut.makeDayNum(2283, 11, 11));
-    EXPECT_EQ(114635, lut.makeDayNum(2500, 12, 25));
+    EXPECT_EQ(120894, lut.makeDayNum(2399, 12, 31));
+    EXPECT_EQ(120894, lut.makeDayNum(2500, 12, 25));
 }
diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h
index e0c42401207..9eac9945cb8 100644
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@@ -536,8 +536,10 @@ template struct ConvertImpl struct ConvertImpl : DateTimeTransformImpl> {};
-const time_t LUT_MIN_TIME = -1420070400l; // 1925-01-01 UTC
-const time_t LUT_MAX_TIME = 9877248000l; // 2282-12-31 UTC
+
+const time_t LUT_MIN_TIME = -2208988800l; // 1900-01-01 UTC
+
+const time_t LUT_MAX_TIME = 10413792000l; // 2300-12-31 UTC
 /** Conversion of numeric to DateTime64 */

From 1d0818d9cf92ae3496b3e977a7fe306fea7772d3 Mon Sep 17 00:00:00 2001
From: Roman Vasin
Date: Fri, 15 Jul 2022 10:33:52 +0000
Subject: [PATCH 006/164] Set max year to 2299; Code cleanup; Make working 02245_make_datetime64 test

---
 src/Common/DateLUTImpl.h | 21 +------
 src/Common/tests/gtest_DateLUTImpl.cpp | 2 +-
 src/Functions/FunctionsConversion.h | 2 +-
 src/Functions/makeDate.cpp | 5 +-
 .../02245_make_datetime64.reference | 56 +++++++++----------
 .../0_stateless/02245_make_datetime64.sql | 8 +--
 6 files changed, 39 insertions(+), 55 deletions(-)

diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h
index c8a0e149515..98df35b2dfa 100644
--- a/src/Common/DateLUTImpl.h
+++ b/src/Common/DateLUTImpl.h
@@ -11,13 +11,10 @@
 #define DATE_LUT_MIN_YEAR 1900 /// 1900 since majority of financial organizations consider 1900 as an initial year.
-// #define DATE_LUT_MAX_YEAR 2258 /// Last supported year (complete) -#define DATE_LUT_MAX_YEAR 2300 /// Last supported year (complete) +#define DATE_LUT_MAX_YEAR 2299 /// Last supported year (complete) #define DATE_LUT_YEARS (1 + DATE_LUT_MAX_YEAR - DATE_LUT_MIN_YEAR) /// Number of years in lookup table -// #define DATE_LUT_SIZE 0x20000 -#define DATE_LUT_SIZE 0x23C1E - +#define DATE_LUT_SIZE 0x23AB1 #define DATE_LUT_MAX (0xFFFFFFFFU - 86400) #define DATE_LUT_MAX_DAY_NUM 0xFFFF @@ -91,68 +88,58 @@ private: friend inline LUTIndex operator+(const LUTIndex & index, const T v) { return normalizeLUTIndex(index.toUnderType() + UInt32(v)); - //return LUTIndex{(index.toUnderType() + UInt32(v)) & date_lut_mask}; } template friend inline LUTIndex operator+(const T v, const LUTIndex & index) { return normalizeLUTIndex(v + index.toUnderType()); - //return LUTIndex{(v + index.toUnderType()) & date_lut_mask}; } friend inline LUTIndex operator+(const LUTIndex & index, const LUTIndex & v) { return normalizeLUTIndex(static_cast(index.toUnderType() + v.toUnderType())); - //return LUTIndex{(index.toUnderType() + v.toUnderType()) & date_lut_mask}; } template friend inline LUTIndex operator-(const LUTIndex & index, const T v) { return normalizeLUTIndex(static_cast(index.toUnderType() - UInt32(v))); - //return LUTIndex{(index.toUnderType() - UInt32(v)) & date_lut_mask}; } template friend inline LUTIndex operator-(const T v, const LUTIndex & index) { return normalizeLUTIndex(static_cast(v - index.toUnderType())); - //return LUTIndex{(v - index.toUnderType()) & date_lut_mask}; } friend inline LUTIndex operator-(const LUTIndex & index, const LUTIndex & v) { return normalizeLUTIndex(static_cast(index.toUnderType() - v.toUnderType())); - //return LUTIndex{(index.toUnderType() - v.toUnderType()) & date_lut_mask}; } template friend inline LUTIndex operator*(const LUTIndex & index, const T v) { return normalizeLUTIndex(index.toUnderType() * UInt32(v)); - // return LUTIndex{(index.toUnderType() * UInt32(v)) /*& date_lut_mask*/}; } template friend inline LUTIndex operator*(const T v, const LUTIndex & index) { return normalizeLUTIndex(v * index.toUnderType()); - // return LUTIndex{(v * index.toUnderType()) /*& date_lut_mask*/}; } template friend inline LUTIndex operator/(const LUTIndex & index, const T v) { return normalizeLUTIndex(index.toUnderType() / UInt32(v)); - // return LUTIndex{(index.toUnderType() / UInt32(v)) /*& date_lut_mask*/}; } template friend inline LUTIndex operator/(const T v, const LUTIndex & index) { return normalizeLUTIndex(UInt32(v) / index.toUnderType()); - // return LUTIndex{(UInt32(v) / index.toUnderType()) /*& date_lut_mask*/}; } public: @@ -267,13 +254,11 @@ private: static inline LUTIndex toLUTIndex(DayNum d) { return normalizeLUTIndex(d + daynum_offset_epoch); - // return LUTIndex{(d + daynum_offset_epoch) /*& date_lut_mask*/}; } static inline LUTIndex toLUTIndex(ExtendedDayNum d) { return normalizeLUTIndex(static_cast(d + daynum_offset_epoch)); - // return LUTIndex{static_cast(d + daynum_offset_epoch) /*& date_lut_mask*/}; } inline LUTIndex toLUTIndex(Time t) const @@ -1098,7 +1083,7 @@ public: auto year_lut_index = (year - DATE_LUT_MIN_YEAR) * 12 + month - 1; UInt32 index = years_months_lut[year_lut_index].toUnderType() + day_of_month - 1; - /// When date is out of range, default value is DATE_LUT_SIZE - 1 (2283-11-11) + /// When date is out of range, default value is DATE_LUT_SIZE - 1 (2299-12-31) return LUTIndex{std::min(index, static_cast(DATE_LUT_SIZE - 1))}; } diff --git 
a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index 2584d48a8d6..95cad92efca 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -84,7 +84,7 @@ TEST(DateLUTTest, makeDayNumTest) EXPECT_EQ(-25567, lut.makeDayNum(1900, 1, 1)); EXPECT_EQ(-16436, lut.makeDayNum(1925, 1, 1)); EXPECT_EQ(0, lut.makeDayNum(1970, 1, 1)); - EXPECT_EQ(120894, lut.makeDayNum(2399, 12, 31)); + EXPECT_EQ(120894, lut.makeDayNum(2300, 12, 31)); EXPECT_EQ(120894, lut.makeDayNum(2500, 12, 25)); } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 9eac9945cb8..0b0a4e9f21b 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -539,7 +539,7 @@ template struct ConvertImpl Date: Fri, 15 Jul 2022 12:58:08 +0000 Subject: [PATCH 007/164] Correct 02243_make_date32 test --- .../0_stateless/02243_make_date32.reference | 7 +++---- tests/queries/0_stateless/02243_make_date32.sql | 15 +++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/queries/0_stateless/02243_make_date32.reference b/tests/queries/0_stateless/02243_make_date32.reference index ac4b10d371a..b986cd285f4 100644 --- a/tests/queries/0_stateless/02243_make_date32.reference +++ b/tests/queries/0_stateless/02243_make_date32.reference @@ -28,10 +28,9 @@ Nullable(Date32) 1969-01-01 1969-12-01 1969-12-31 -2282-01-01 -2283-01-01 -2283-11-11 -1970-01-01 +2298-01-01 +2299-01-01 +2299-12-31 1970-01-01 1970-01-01 1970-01-01 diff --git a/tests/queries/0_stateless/02243_make_date32.sql b/tests/queries/0_stateless/02243_make_date32.sql index c01855546c5..2cf4ac6b358 100644 --- a/tests/queries/0_stateless/02243_make_date32.sql +++ b/tests/queries/0_stateless/02243_make_date32.sql @@ -39,14 +39,13 @@ select makeDate32(2150,1,1); select makeDate32(1969,1,1); select makeDate32(1969,12,1); select makeDate32(1969,12,31); -select makeDate32(2282,1,1); -select makeDate32(2283,1,1); -select makeDate32(2283,11,11); -select makeDate32(2283,11,12); -select makeDate32(2284,1,1); -select makeDate32(1924,1,1); -select makeDate32(1924,12,1); -select makeDate32(1924,12,31); +select makeDate32(2298,1,1); +select makeDate32(2299,1,1); +select makeDate32(2299,12,31); +select makeDate32(2300,1,1); +select makeDate32(1899,1,1); +select makeDate32(1899,12,1); +select makeDate32(1899,12,31); select makeDate32(1970,0,0); select makeDate32(1970,0,1); select makeDate32(1970,1,0); From 03194eaeb92c6225112db68b26e24b2f03a8116d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 15 Jul 2022 10:01:30 -0700 Subject: [PATCH 008/164] Feature - Structured Logging Support resubmit --- programs/server/config.xml | 9 + src/Daemon/BaseDaemon.cpp | 204 +++++++++--------- src/Loggers/Loggers.cpp | 105 ++++++--- src/Loggers/OwnFormattingChannel.cpp | 12 +- src/Loggers/OwnFormattingChannel.h | 8 + src/Loggers/OwnJSONPatternFormatter.cpp | 123 +++++++++++ src/Loggers/OwnJSONPatternFormatter.h | 31 +++ .../test_structured_logging_json/__init__.py | 0 .../test_structured_logging_json/test.py | 57 +++++ 9 files changed, 415 insertions(+), 134 deletions(-) create mode 100644 src/Loggers/OwnJSONPatternFormatter.cpp create mode 100644 src/Loggers/OwnJSONPatternFormatter.h create mode 100644 tests/integration/test_structured_logging_json/__init__.py create mode 100644 tests/integration/test_structured_logging_json/test.py diff --git a/programs/server/config.xml b/programs/server/config.xml index 203684a9e00..f8e7fa9d8cf 100644 --- 
a/programs/server/config.xml +++ b/programs/server/config.xml @@ -60,6 +60,15 @@ --> + + true diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 23835df87ea..f9b627aaf79 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1,13 +1,14 @@ #ifdef HAS_RESERVED_IDENTIFIER -#pragma clang diagnostic ignored "-Wreserved-identifier" +# pragma clang diagnostic ignored "-Wreserved-identifier" #endif #include #include +#include #include -#include #include +#include #include #include #if defined(OS_LINUX) @@ -18,51 +19,52 @@ #include #include -#include -#include #include +#include #include +#include #include +#include +#include #include #include -#include -#include -#include #include -#include #include +#include #include +#include -#include -#include +#include #include #include +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include -#include -#include +#include #include -#include +#include #include +#include #include #include #if defined(OS_DARWIN) -# pragma GCC diagnostic ignored "-Wunused-macros" +# pragma GCC diagnostic ignored "-Wunused-macros" // NOLINTNEXTLINE(bugprone-reserved-identifier) -# define _XOPEN_SOURCE 700 // ucontext is not available without _XOPEN_SOURCE +# define _XOPEN_SOURCE 700 // ucontext is not available without _XOPEN_SOURCE #endif #include @@ -92,19 +94,14 @@ static void call_default_signal_handler(int sig) DB::throwFromErrno("Cannot send signal.", DB::ErrorCodes::CANNOT_SEND_SIGNAL); } -static const size_t signal_pipe_buf_size = - sizeof(int) - + sizeof(siginfo_t) - + sizeof(ucontext_t*) - + sizeof(StackTrace) - + sizeof(UInt32) - + sizeof(void*); +static const size_t signal_pipe_buf_size + = sizeof(int) + sizeof(siginfo_t) + sizeof(ucontext_t *) + sizeof(StackTrace) + sizeof(UInt32) + sizeof(void *); -using signal_function = void(int, siginfo_t*, void*); +using signal_function = void(int, siginfo_t *, void *); static void writeSignalIDtoSignalPipe(int sig) { - auto saved_errno = errno; /// We must restore previous value of errno in signal handler. + auto saved_errno = errno; /// We must restore previous value of errno in signal handler. char buf[signal_pipe_buf_size]; DB::WriteBufferFromFileDescriptor out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf); @@ -133,7 +130,7 @@ static void terminateRequestedSignalHandler(int sig, siginfo_t *, void *) static void signalHandler(int sig, siginfo_t * info, void * context) { DENY_ALLOCATIONS_IN_SCOPE; - auto saved_errno = errno; /// We must restore previous value of errno in signal handler. + auto saved_errno = errno; /// We must restore previous value of errno in signal handler. char buf[signal_pipe_buf_size]; DB::WriteBufferFromFileDescriptorDiscardOnFailure out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf); @@ -153,7 +150,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context) if (sig != SIGTSTP) /// This signal is used for debugging. { /// The time that is usually enough for separate thread to print info into log. - sleepForSeconds(20); /// FIXME: use some feedback from threads that process stacktrace + sleepForSeconds(20); /// FIXME: use some feedback from threads that process stacktrace call_default_signal_handler(sig); } @@ -162,8 +159,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context) /// Avoid link time dependency on DB/Interpreters - will use this function only when linked. 
-__attribute__((__weak__)) void collectCrashLog( - Int32 signal, UInt64 thread_id, const String & query_id, const StackTrace & stack_trace); +__attribute__((__weak__)) void collectCrashLog(Int32 signal, UInt64 thread_id, const String & query_id, const StackTrace & stack_trace); /** The thread that read info about signal or std::terminate from pipe. @@ -181,16 +177,14 @@ public: SanitizerTrap = -3, }; - explicit SignalListener(BaseDaemon & daemon_) - : log(&Poco::Logger::get("BaseDaemon")) - , daemon(daemon_) - { - } + explicit SignalListener(BaseDaemon & daemon_) : log(&Poco::Logger::get("BaseDaemon")), daemon(daemon_) { } void run() override { static_assert(PIPE_BUF >= 512); - static_assert(signal_pipe_buf_size <= PIPE_BUF, "Only write of PIPE_BUF to pipe is atomic and the minimal known PIPE_BUF across supported platforms is 512"); + static_assert( + signal_pipe_buf_size <= PIPE_BUF, + "Only write of PIPE_BUF to pipe is atomic and the minimal known PIPE_BUF across supported platforms is 512"); char buf[signal_pipe_buf_size]; DB::ReadBufferFromFileDescriptor in(signal_pipe.fds_rw[0], signal_pipe_buf_size, buf); @@ -225,9 +219,7 @@ public: onTerminate(message, thread_num); } - else if (sig == SIGINT || - sig == SIGQUIT || - sig == SIGTERM) + else if (sig == SIGINT || sig == SIGQUIT || sig == SIGTERM) { daemon.handleSignal(sig); } @@ -264,8 +256,14 @@ private: { size_t pos = message.find('\n'); - LOG_FATAL(log, "(version {}{}, {}) (from thread {}) {}", - VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, thread_num, message.substr(0, pos)); + LOG_FATAL( + log, + "(version {}{}, {}) (from thread {}) {}", + VERSION_STRING, + VERSION_OFFICIAL, + daemon.build_id_info, + thread_num, + message.substr(0, pos)); /// Print trace from std::terminate exception line-by-line to make it easy for grep. while (pos != std::string_view::npos) @@ -313,15 +311,29 @@ private: if (query_id.empty()) { - LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (no query) Received signal {} ({})", - VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, - thread_num, strsignal(sig), sig); + LOG_FATAL( + log, + "(version {}{}, {}) (from thread {}) (no query) Received signal {} ({})", + VERSION_STRING, + VERSION_OFFICIAL, + daemon.build_id_info, + thread_num, + strsignal(sig), + sig); } else { - LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})", - VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, - thread_num, query_id, query, strsignal(sig), sig); + LOG_FATAL( + log, + "(version {}{}, {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})", + VERSION_STRING, + VERSION_OFFICIAL, + daemon.build_id_info, + thread_num, + query_id, + query, + strsignal(sig), + sig); } String error_message; @@ -395,12 +407,7 @@ private: #if defined(SANITIZER) extern "C" void __sanitizer_set_death_callback(void (*)()); -/// Sanitizers may not expect some function calls from death callback. -/// Let's try to disable instrumentation to avoid possible issues. -/// However, this callback may call other functions that are still instrumented. -/// We can try [[clang::always_inline]] attribute for statements in future (available in clang-15) -/// See https://github.com/google/sanitizers/issues/1543 and https://github.com/google/sanitizers/issues/1549. -static DISABLE_SANITIZER_INSTRUMENTATION void sanitizerDeathCallback() +static void sanitizerDeathCallback() { DENY_ALLOCATIONS_IN_SCOPE; /// Also need to send data via pipe. 
Otherwise it may lead to deadlocks or failures in printing diagnostic info. @@ -608,7 +615,9 @@ void debugIncreaseOOMScore() LOG_INFO(&Poco::Logger::root(), "Set OOM score adjustment to {}", new_score); } #else -void debugIncreaseOOMScore() {} +void debugIncreaseOOMScore() +{ +} #endif } @@ -731,14 +740,12 @@ void BaseDaemon::initialize(Application & self) if (!log_path.empty()) { std::string path = createDirectory(log_path); - if (is_daemon - && chdir(path.c_str()) != 0) + if (is_daemon && chdir(path.c_str()) != 0) throw Poco::Exception("Cannot change directory to " + path); } else { - if (is_daemon - && chdir("/tmp") != 0) + if (is_daemon && chdir("/tmp") != 0) throw Poco::Exception("Cannot change directory to /tmp"); } @@ -885,50 +892,40 @@ void BaseDaemon::initializeTerminationAndSignalProcessing() void BaseDaemon::logRevision() const { - Poco::Logger::root().information("Starting " + std::string{VERSION_FULL} - + " with revision " + std::to_string(ClickHouseRevision::getVersionRevision()) - + ", " + build_id_info - + ", PID " + std::to_string(getpid())); + Poco::Logger::root().information( + "Starting " + std::string{VERSION_FULL} + " with revision " + std::to_string(ClickHouseRevision::getVersionRevision()) + ", " + + build_id_info + ", PID " + std::to_string(getpid())); } void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options) { - new_options.addOption( - Poco::Util::Option("config-file", "C", "load configuration from a given file") - .required(false) - .repeatable(false) - .argument("") - .binding("config-file")); + new_options.addOption(Poco::Util::Option("config-file", "C", "load configuration from a given file") + .required(false) + .repeatable(false) + .argument("") + .binding("config-file")); + + new_options.addOption(Poco::Util::Option("log-file", "L", "use given log file") + .required(false) + .repeatable(false) + .argument("") + .binding("logger.log")); + + new_options.addOption(Poco::Util::Option("errorlog-file", "E", "use given log file for errors only") + .required(false) + .repeatable(false) + .argument("") + .binding("logger.errorlog")); new_options.addOption( - Poco::Util::Option("log-file", "L", "use given log file") - .required(false) - .repeatable(false) - .argument("") - .binding("logger.log")); - - new_options.addOption( - Poco::Util::Option("errorlog-file", "E", "use given log file for errors only") - .required(false) - .repeatable(false) - .argument("") - .binding("logger.errorlog")); - - new_options.addOption( - Poco::Util::Option("pid-file", "P", "use given pidfile") - .required(false) - .repeatable(false) - .argument("") - .binding("pid")); + Poco::Util::Option("pid-file", "P", "use given pidfile").required(false).repeatable(false).argument("").binding("pid")); Poco::Util::ServerApplication::defineOptions(new_options); } void BaseDaemon::handleSignal(int signal_id) { - if (signal_id == SIGINT || - signal_id == SIGQUIT || - signal_id == SIGTERM) + if (signal_id == SIGINT || signal_id == SIGQUIT || signal_id == SIGTERM) { std::lock_guard lock(signal_handler_mutex); { @@ -962,7 +959,7 @@ void BaseDaemon::waitForTerminationRequest() { /// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads std::unique_lock lock(signal_handler_mutex); - signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; }); + signal_event.wait(lock, [this]() { return terminate_signals_counter > 0; }); } @@ -1001,7 +998,7 @@ void BaseDaemon::setupWatchdog() } /// Change short thread name and process name. 
- setThreadName("clckhouse-watch"); /// 15 characters + setThreadName("clckhouse-watch"); /// 15 characters if (argv0) { @@ -1013,9 +1010,18 @@ void BaseDaemon::setupWatchdog() /// If streaming compression of logs is used then we write watchdog logs to cerr if (config().getRawString("logger.stream_compress", "false") == "true") { - Poco::AutoPtr pf = new OwnPatternFormatter; - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); - logger().setChannel(log); + if (config().has("logger.json")) + { + Poco::AutoPtr pf = new OwnJSONPatternFormatter; + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); + logger().setChannel(log); + } + else + { + Poco::AutoPtr pf = new OwnPatternFormatter; + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); + logger().setChannel(log); + } } logger().information(fmt::format("Will watch for the process with pid {}", pid)); @@ -1073,9 +1079,11 @@ void BaseDaemon::setupWatchdog() if (sig == SIGKILL) { - logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)." + logger().fatal(fmt::format( + "Child process was terminated by signal {} (KILL)." " If it is not done by 'forcestop' command or manually," - " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig)); + " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", + sig)); } else { diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 70205998bb5..fdcd75f761c 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -1,17 +1,18 @@ #include "Loggers.h" #include -#include -#include -#include "OwnFormattingChannel.h" -#include "OwnPatternFormatter.h" -#include "OwnSplitChannel.h" #include #include #include +#include +#include +#include "OwnFormattingChannel.h" +#include "OwnJSONPatternFormatter.h" +#include "OwnPatternFormatter.h" +#include "OwnSplitChannel.h" #ifdef WITH_TEXT_LOG - #include +# include #endif #include @@ -20,10 +21,9 @@ namespace fs = std::filesystem; namespace DB { - class SensitiveDataMasker; +class SensitiveDataMasker; } - // TODO: move to libcommon static std::string createDirectory(const std::string & file) { @@ -49,7 +49,6 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (auto log = text_log.lock()) split->addTextLog(log, text_log_max_priority); #endif - auto current_logger = config.getString("logger", ""); if (config_logger == current_logger) //-V1051 return; @@ -97,11 +96,22 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log log_file->setProperty(Poco::FileChannel::PROP_ROTATEONOPEN, config.getRawString("logger.rotateOnOpen", "false")); log_file->open(); - Poco::AutoPtr pf = new OwnPatternFormatter; + if (config.has("logger.json")) + { + Poco::AutoPtr pf = new OwnJSONPatternFormatter; - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); - log->setLevel(log_level); - split->addChannel(log, "log"); + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); + log->setLevel(log_level); + split->addChannel(log, "log"); + } + else + { + Poco::AutoPtr pf = new OwnPatternFormatter; + + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); + log->setLevel(log_level); + split->addChannel(log, "log"); + } } const auto errorlog_path = config.getString("logger.errorlog", ""); @@ -133,12 +143,24 @@ void 
Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log error_log_file->setProperty(Poco::FileChannel::PROP_FLUSH, config.getRawString("logger.flush", "true")); error_log_file->setProperty(Poco::FileChannel::PROP_ROTATEONOPEN, config.getRawString("logger.rotateOnOpen", "false")); - Poco::AutoPtr pf = new OwnPatternFormatter; + if (config.has("logger.json")) + { + Poco::AutoPtr pf = new OwnJSONPatternFormatter; - Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); - errorlog->setLevel(errorlog_level); - errorlog->open(); - split->addChannel(errorlog, "errorlog"); + Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); + errorlog->setLevel(errorlog_level); + errorlog->open(); + split->addChannel(errorlog, "errorlog"); + } + else + { + Poco::AutoPtr pf = new OwnPatternFormatter; + + Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); + errorlog->setLevel(errorlog_level); + errorlog->open(); + split->addChannel(errorlog, "errorlog"); + } } if (config.getBool("logger.use_syslog", false)) @@ -172,19 +194,29 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log } syslog_channel->open(); - Poco::AutoPtr pf = new OwnPatternFormatter; + if (config.has("logger.json")) + { + Poco::AutoPtr pf = new OwnJSONPatternFormatter; - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); - log->setLevel(syslog_level); + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); + log->setLevel(syslog_level); - split->addChannel(log, "syslog"); + split->addChannel(log, "syslog"); + } + else + { + Poco::AutoPtr pf = new OwnPatternFormatter; + + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); + log->setLevel(syslog_level); + + split->addChannel(log, "syslog"); + } } bool should_log_to_console = isatty(STDIN_FILENO) || isatty(STDERR_FILENO); bool color_logs_by_default = isatty(STDERR_FILENO); - - if (config.getBool("logger.console", false) - || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) + if (config.getBool("logger.console", false) || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) { bool color_enabled = config.getBool("logger.color_terminal", color_logs_by_default); @@ -194,13 +226,23 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log { max_log_level = console_log_level; } - - Poco::AutoPtr pf = new OwnPatternFormatter(color_enabled); - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); - log->setLevel(console_log_level); - split->addChannel(log, "console"); + if (config.has("logger.json")) + { + Poco::AutoPtr pf = new OwnJSONPatternFormatter(); + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); + log->setLevel(console_log_level); + split->addChannel(log, "console"); + } + else + { + Poco::AutoPtr pf = new OwnPatternFormatter(color_enabled); + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); + log->setLevel(console_log_level); + split->addChannel(log, "console"); + } } + split->open(); logger.close(); logger.setChannel(split); @@ -260,8 +302,7 @@ void Loggers::updateLevels(Poco::Util::AbstractConfiguration & config, Poco::Log // Set level to console bool is_daemon = config.getBool("application.runAsDaemon", false); bool should_log_to_console = isatty(STDIN_FILENO) || isatty(STDERR_FILENO); - if (config.getBool("logger.console", false) - || 
(!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) + if (config.getBool("logger.console", false) || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) split->setLevel("console", log_level); else split->setLevel("console", 0); diff --git a/src/Loggers/OwnFormattingChannel.cpp b/src/Loggers/OwnFormattingChannel.cpp index f03d155bde7..35f035d44ce 100644 --- a/src/Loggers/OwnFormattingChannel.cpp +++ b/src/Loggers/OwnFormattingChannel.cpp @@ -1,15 +1,19 @@ #include "OwnFormattingChannel.h" +#include "OwnJSONPatternFormatter.h" #include "OwnPatternFormatter.h" - - namespace DB { - void OwnFormattingChannel::logExtended(const ExtendedLogMessage & msg) { if (pChannel && priority >= msg.base.getPriority()) { - if (pFormatter) + if (pFormatterJSON) + { + std::string text; + pFormatterJSON->formatExtendedJSON(msg, text); + pChannel->log(Poco::Message(msg.base, text)); + } + else if (pFormatter) { std::string text; pFormatter->formatExtended(msg, text); diff --git a/src/Loggers/OwnFormattingChannel.h b/src/Loggers/OwnFormattingChannel.h index 0480d0d5061..12e8b24192d 100644 --- a/src/Loggers/OwnFormattingChannel.h +++ b/src/Loggers/OwnFormattingChannel.h @@ -4,6 +4,7 @@ #include #include #include "ExtendedLogChannel.h" +#include "OwnJSONPatternFormatter.h" #include "OwnPatternFormatter.h" @@ -19,6 +20,12 @@ public: { } + explicit OwnFormattingChannel( + Poco::AutoPtr pFormatterJSON_ = nullptr, Poco::AutoPtr pChannel_ = nullptr) + : pFormatterJSON(std::move(pFormatterJSON_)), pChannel(std::move(pChannel_)), priority(Poco::Message::PRIO_TRACE) + { + } + void setChannel(Poco::AutoPtr pChannel_) { pChannel = std::move(pChannel_); } void setLevel(Poco::Message::Priority priority_) { priority = priority_; } @@ -45,6 +52,7 @@ public: private: Poco::AutoPtr pFormatter; + Poco::AutoPtr pFormatterJSON; Poco::AutoPtr pChannel; std::atomic priority; }; diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp new file mode 100644 index 00000000000..e9132ef0f0b --- /dev/null +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -0,0 +1,123 @@ +#include "OwnJSONPatternFormatter.h" + +#include +#include +#include +#include +#include +#include +#include + +OwnJSONPatternFormatter::OwnJSONPatternFormatter() : Poco::PatternFormatter("") +{ +} + + +void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) const +{ + DB::WriteBufferFromString wb(text); + + DB::FormatSettings settings; + String key_name; + + const Poco::Message & msg = msg_ext.base; + DB::writeChar('{', wb); + + key_name = "date_time"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + + DB::writeChar('\"', wb); + /// Change delimiters in date for compatibility with old logs. 
+ writeDateTimeUnixTimestamp(msg_ext.time_seconds, 0, wb); + DB::writeChar('.', wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 100000) % 10), wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 10000) % 10), wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 1000) % 10), wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 100) % 10), wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 10) % 10), wb); + DB::writeChar('0' + ((msg_ext.time_microseconds / 1) % 10), wb); + DB::writeChar('\"', wb); + + DB::writeChar(',', wb); + + key_name = "thread_name"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + writeJSONString(StringRef(msg.getThread()), wb, settings); + + DB::writeChar(',', wb); + + key_name = "thread_id"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + DB::writeChar('\"', wb); + DB::writeIntText(msg_ext.thread_id, wb); + DB::writeChar('\"', wb); + + DB::writeChar(',', wb); + + key_name = "level"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + int priority = static_cast(msg.getPriority()); + writeJSONString(StringRef(getPriorityName(priority)), wb, settings); + + DB::writeChar(',', wb); + + /// We write query_id even in case when it is empty (no query context) + /// just to be convenient for various log parsers. + + key_name = "query_id"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + writeJSONString(msg_ext.query_id, wb, settings); + + DB::writeChar(',', wb); + + key_name = "logger_name"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + + writeJSONString(StringRef(msg.getSource()), wb, settings); + + DB::writeChar(',', wb); + + key_name = "message"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + String msg_text = msg.getText(); + writeJSONString(StringRef(msg_text), wb, settings); + + DB::writeChar(',', wb); + + key_name = "source_file"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + const char * source_file = msg.getSourceFile(); + if (source_file != nullptr) + { + writeJSONString(StringRef(source_file), wb, settings); + } + + else + { + writeJSONString(StringRef(""), wb, settings); + } + + DB::writeChar(',', wb); + + key_name = "source_line"; + writeJSONString(StringRef(key_name), wb, settings); + DB::writeChar(':', wb); + DB::writeChar('\"', wb); + DB::writeIntText(msg.getSourceLine(), wb); + DB::writeChar('\"', wb); + + DB::writeChar('}', wb); +} + +void OwnJSONPatternFormatter::format(const Poco::Message & msg, std::string & text) +{ + formatExtendedJSON(DB::ExtendedLogMessage::getFrom(msg), text); +} diff --git a/src/Loggers/OwnJSONPatternFormatter.h b/src/Loggers/OwnJSONPatternFormatter.h new file mode 100644 index 00000000000..54e49a6391d --- /dev/null +++ b/src/Loggers/OwnJSONPatternFormatter.h @@ -0,0 +1,31 @@ +#pragma once + + +#include +#include "ExtendedLogChannel.h" + + +/** Format log messages own way in JSON. + * We can't obtain some details using Poco::PatternFormatter. + * + * Firstly, the thread number here is peaked not from Poco::Thread + * threads only, but from all threads with number assigned (see ThreadNumber.h) + * + * Secondly, the local date and time are correctly displayed. + * Poco::PatternFormatter does not work well with local time, + * when timestamps are close to DST timeshift moments. 
+ * - see Poco sources and http://thread.gmane.org/gmane.comp.time.tz/8883 + * + * Also it's made a bit more efficient (unimportant). + */ + +class Loggers; + +class OwnJSONPatternFormatter : public Poco::PatternFormatter +{ +public: + OwnJSONPatternFormatter(); + + void format(const Poco::Message & msg, std::string & text) override; + void formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) const; +}; diff --git a/tests/integration/test_structured_logging_json/__init__.py b/tests/integration/test_structured_logging_json/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_structured_logging_json/test.py b/tests/integration/test_structured_logging_json/test.py new file mode 100644 index 00000000000..34507a605c6 --- /dev/null +++ b/tests/integration/test_structured_logging_json/test.py @@ -0,0 +1,57 @@ +import pytest +from helpers.cluster import ClickHouseCluster +import logging +import json +from xml.etree import ElementTree + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node", stay_alive=True) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def get_log_array(logs): + log_array = [] + temp_log = "" + for i in range(0, len(logs)): + temp_log += logs[i] + if logs[i] == "}": + log_array.append(temp_log) + temp_log = "" + return log_array + + +def is_json(log_json): + try: + json.loads(log_json) + except ValueError as e: + return False + return True + + +def test_structured_logging_json_format(start_cluster): + config = node.exec_in_container(["cat", "/etc/clickhouse-server/config.xml"]) + root = ElementTree.fromstring(config) + for logger in root.findall("logger"): + if logger.find("json") is None: + pytest.skip("JSON is not activated in config.xml") + + node.query("SELECT 1") + + logs = node.grep_in_log(" ") + log_array = get_log_array(logs) + result = True + for i in range(0, len(log_array)): + temporary_result = is_json(log_array[i]) + result &= temporary_result + # we will test maximum 5 logs + if i >= min(4, len(log_array) - 1): + break + assert result == True From 25bafe5585a3bebdaccf450eb6c3712ad4f72c3b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 15 Jul 2022 21:06:04 +0200 Subject: [PATCH 009/164] Fix test 02325_dates_schema_inference --- tests/queries/0_stateless/02325_dates_schema_inference.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02325_dates_schema_inference.sql b/tests/queries/0_stateless/02325_dates_schema_inference.sql index 4527d4d32f5..3534a0eb48f 100644 --- a/tests/queries/0_stateless/02325_dates_schema_inference.sql +++ b/tests/queries/0_stateless/02325_dates_schema_inference.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + set input_format_try_infer_dates=1; set input_format_try_infer_datetimes=1; From b5bbf45ba72b7a4770839693c9f9b3111ac29306 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 15 Jul 2022 21:06:43 +0200 Subject: [PATCH 010/164] Fix test 02326_numbers_from_json_strings_schema_inference --- .../02326_numbers_from_json_strings_schema_inference.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql index d94e9b2dc23..2012a53c09d 100644 --- 
a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql +++ b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + set input_format_json_try_infer_numbers_from_strings=1; desc format(JSONEachRow, '{"x" : "123"}'); From ddcb8aece8d8b505b73439fccee51d28653dbc9f Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 15 Jul 2022 21:07:10 +0200 Subject: [PATCH 011/164] Fix test 02327_try_infer_integers_schema_inference --- .../0_stateless/02327_try_infer_integers_schema_inference.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql index 6dc94a643a2..0ceed178865 100644 --- a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql +++ b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + set input_format_try_infer_integers=1; select 'JSONEachRow'; From af718f7e04013d108ebff72ec20c00f5e8b29174 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 15 Jul 2022 13:06:39 -0700 Subject: [PATCH 012/164] updated config.xml --- programs/server/config.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index f8e7fa9d8cf..466301eef2b 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -68,7 +68,7 @@ To enable JSON logging support, just uncomment tag. Having the tag will make it work. For better understanding/visibility, you can add "true" or "1". --> - true + From 7b28bd11c57308eedefdd502f1e1bc190272adfa Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 18 Jul 2022 15:39:10 +0200 Subject: [PATCH 013/164] Fix style --- src/DataTypes/transformTypesRecursively.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataTypes/transformTypesRecursively.cpp b/src/DataTypes/transformTypesRecursively.cpp index 2f1b689a233..48e9dc60c19 100644 --- a/src/DataTypes/transformTypesRecursively.cpp +++ b/src/DataTypes/transformTypesRecursively.cpp @@ -25,7 +25,7 @@ void transformTypesRecursively(DataTypes & types, std::function Date: Mon, 18 Jul 2022 15:39:53 +0200 Subject: [PATCH 014/164] Fix style --- src/Formats/JSONUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 63c06a8615d..ebf8ef7e783 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -362,7 +362,7 @@ namespace JSONUtils } }; - DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings) + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings) { JSONCompactEachRowFieldsExtractor extractor; return determineColumnDataTypesFromJSONEachRowDataImpl(in, settings, json_strings, extractor); From 857290b586ab707c096651467bc1b4406ac07d78 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 18 Jul 2022 15:40:28 +0200 Subject: [PATCH 015/164] Fix style --- src/Formats/EscapingRuleUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 3aceaeeff1b..69684b67071 100644 --- 
a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -431,7 +431,7 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c second = std::move(types[1]); } -void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set * numbers_parsed_from_json_strings) +void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set * numbers_parsed_from_json_strings) { transformInferredTypesIfNeededImpl(types, settings, true, numbers_parsed_from_json_strings); } From 81bbc3ef1a162a986766fa3aca3522ff062c0073 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:10:25 +0200 Subject: [PATCH 016/164] Turn on new settings by default --- src/Core/Settings.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b0e7f554717..f700707d1a8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -686,10 +686,10 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \ M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ - M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \ - M(Bool, input_format_try_infer_integers, false, "Try to infer numbers from string fields while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_dates, false, "Try to infer dates from string fields while schema inference in text formats", 0) \ - M(Bool, input_format_try_infer_datetimes, false, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \ + M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. 
If turned off, default and null values are not serialized", 0) \ M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \ From 28fd774df8c90e2e440e0cd8d6fe5aa653e121e1 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 18 Jul 2022 16:44:14 +0000 Subject: [PATCH 017/164] Correct 01821_to_date_time_ubsan and 01921_datatype_date32 tests --- src/Common/DateLUTImpl.h | 6 +- .../01821_to_date_time_ubsan.reference | 4 +- .../01921_datatype_date32.reference | 334 +++++++++--------- .../0_stateless/01921_datatype_date32.sql | 11 +- 4 files changed, 177 insertions(+), 178 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 98df35b2dfa..a5a2b491a61 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -71,7 +71,7 @@ private: static inline LUTIndex normalizeLUTIndex(UInt32 index) { if (index >= DATE_LUT_SIZE) - LUTIndex(DATE_LUT_SIZE - 1); + return LUTIndex(DATE_LUT_SIZE - 1); return LUTIndex{index}; } @@ -80,7 +80,7 @@ private: if unlikely(index < 0 ) return LUTIndex(0); if (index >= DATE_LUT_SIZE) - LUTIndex(DATE_LUT_SIZE - 1); + return LUTIndex(DATE_LUT_SIZE - 1); return LUTIndex{index}; } @@ -93,7 +93,7 @@ private: template friend inline LUTIndex operator+(const T v, const LUTIndex & index) { - return normalizeLUTIndex(v + index.toUnderType()); + return normalizeLUTIndex(static_cast(v + index.toUnderType())); } friend inline LUTIndex operator+(const LUTIndex & index, const LUTIndex & v) diff --git a/tests/queries/0_stateless/01821_to_date_time_ubsan.reference b/tests/queries/0_stateless/01821_to_date_time_ubsan.reference index 0a762ec3b77..28c4987125c 100644 --- a/tests/queries/0_stateless/01821_to_date_time_ubsan.reference +++ b/tests/queries/0_stateless/01821_to_date_time_ubsan.reference @@ -1,2 +1,2 @@ -2283-11-11 23:48:05.4775806 -2283-11-11 23:52:48.54775806 +2299-12-31 23:48:05.4775806 +2299-12-31 23:52:48.54775806 diff --git a/tests/queries/0_stateless/01921_datatype_date32.reference b/tests/queries/0_stateless/01921_datatype_date32.reference index 70eebc76c01..acb0cc4ca59 100644 --- a/tests/queries/0_stateless/01921_datatype_date32.reference +++ b/tests/queries/0_stateless/01921_datatype_date32.reference @@ -1,19 +1,19 @@ -1925-01-01 -1925-01-01 -2282-12-31 -2283-11-11 +1900-01-01 +1900-01-01 +2299-12-15 +2299-12-31 2021-06-22 -------toYear--------- -1925 -1925 -2282 -2283 +1900 +1900 +2299 +2299 2021 -------toMonth--------- 1 1 12 -11 +12 6 -------toQuarter--------- 1 @@ -24,83 +24,83 @@ -------toDayOfMonth--------- 1 1 +15 31 -11 22 -------toDayOfWeek--------- -4 -4 -7 +1 +1 +5 7 2 -------toDayOfYear--------- 1 1 -365 -315 +349 +364 173 -------toHour--------- -------toMinute--------- -------toSecond--------- -------toStartOfDay--------- -2061-02-06 07:28:16 -2061-02-06 07:28:16 -2010-10-17 11:03:28 -2011-08-28 11:03:28 +2036-02-07 07:31:20 +2036-02-07 07:31:20 +2027-10-01 11:03:28 +2027-10-17 11:03:28 2021-06-22 00:00:00 -------toMonday--------- -2104-06-04 -2104-06-04 -2103-07-21 -2104-05-31 +2079-06-07 +2079-06-07 +2120-07-06 +2120-07-20 2021-06-21 -------toISOWeek--------- 1 1 +50 52 -45 25 -------toISOYear--------- -1925 -1925 -2282 -2283 +1900 +1900 +2299 +2299 2021 -------toWeek--------- 0 0 +50 53 -45 25 -------toYearWeek--------- -192452 -192452 -228253 -228345 +189953 +189953 +229950 +229953 202125 -------toStartOfWeek--------- -2104-06-03 -2104-06-03 -2103-07-27 -2104-06-06 +2079-06-06 +2079-06-06 +2120-07-05 +2120-07-26 2021-06-20 
-------toStartOfMonth--------- -2104-06-07 -2104-06-07 -2103-06-27 -2104-05-27 +2079-06-07 +2079-06-07 +2120-06-26 +2120-06-26 2021-06-01 -------toStartOfQuarter--------- -2104-06-07 -2104-06-07 -2103-04-27 -2104-04-26 +2079-06-07 +2079-06-07 +2120-04-26 +2120-04-26 2021-04-01 -------toStartOfYear--------- -2104-06-07 -2104-06-07 -2102-07-28 -2103-07-28 +2079-06-07 +2079-06-07 +2119-07-28 +2119-07-28 2021-01-01 -------toStartOfSecond--------- -------toStartOfMinute--------- @@ -109,183 +109,183 @@ -------toStartOfFifteenMinutes--------- -------toStartOfHour--------- -------toStartOfISOYear--------- -2104-06-04 -2104-06-04 -2102-07-29 -2103-07-28 +2079-06-07 +2079-06-07 +2119-07-29 +2119-07-29 2021-01-04 -------toRelativeYearNum--------- -1925 -1925 -2282 -2283 +1900 +1900 +2299 +2299 2021 -------toRelativeQuarterNum--------- -7700 -7700 -9131 -9135 +7600 +7600 +9199 +9199 8085 -------toRelativeMonthNum--------- -23101 -23101 -27396 -27407 +22801 +22801 +27600 +27600 24258 -------toRelativeWeekNum--------- -63189 -63189 -16331 -63188 +61885 +61885 +17216 +17217 2686 -------toRelativeDayNum--------- -49100 -49100 -48784 -49099 +39969 +39969 +54977 +54993 18800 -------toRelativeHourNum--------- -4294572852 -4294572852 -2743677 -2751237 +4294353708 +4294353708 +2892309 +2892693 451197 -------toRelativeMinuteNum--------- -4271299336 -4271299336 -164620620 -165074220 +4258150699 +4258150699 +173538540 +173561580 27071820 -------toRelativeSecondNum--------- -2874889696 -2874889696 -1287302608 -1314518608 +2085971480 +2085971480 +1822377808 +1823760208 1624309200 -------toTime--------- -------toYYYYMM--------- -192501 -192501 -228212 -228311 +190001 +190001 +229912 +229912 202106 -------toYYYYMMDD--------- -19250101 -19250101 -22821231 -22831111 +19000101 +19000101 +22991215 +22991231 20210622 -------toYYYYMMDDhhmmss--------- -19250101000000 -19250101000000 -22821231000000 -22831111000000 +19000101000000 +19000101000000 +22991215000000 +22991231000000 20210622000000 -------addSeconds--------- -1925-01-01 01:00:00.000 -1925-01-01 01:00:00.000 -2282-12-31 01:00:00.000 -2283-11-11 01:00:00.000 +1900-01-01 01:00:00.000 +1900-01-01 01:00:00.000 +2299-12-15 01:00:00.000 +2299-12-31 01:00:00.000 2021-06-22 01:00:00.000 -------addMinutes--------- -1925-01-01 01:00:00.000 -1925-01-01 01:00:00.000 -2282-12-31 01:00:00.000 -2283-11-11 01:00:00.000 +1900-01-01 01:00:00.000 +1900-01-01 01:00:00.000 +2299-12-15 01:00:00.000 +2299-12-31 01:00:00.000 2021-06-22 01:00:00.000 -------addHours--------- -1925-01-01 01:00:00.000 -1925-01-01 01:00:00.000 -2282-12-31 01:00:00.000 -2283-11-11 01:00:00.000 +1900-01-01 01:00:00.000 +1900-01-01 01:00:00.000 +2299-12-15 01:00:00.000 +2299-12-31 01:00:00.000 2021-06-22 01:00:00.000 -------addDays--------- -1925-01-08 -1925-01-08 -2283-01-07 -1925-01-07 +1900-01-08 +1900-01-08 +2299-12-22 +2299-12-31 2021-06-29 -------addWeeks--------- -1925-01-08 -1925-01-08 -2283-01-07 -1925-01-07 +1900-01-08 +1900-01-08 +2299-12-22 +2299-12-31 2021-06-29 -------addMonths--------- -1925-02-01 -1925-02-01 -2283-01-31 -2283-11-11 +1900-02-01 +1900-02-01 +2299-12-31 +2299-12-31 2021-07-22 -------addQuarters--------- -1925-04-01 -1925-04-01 -2283-03-31 -2283-11-11 +1900-04-01 +1900-04-01 +2299-12-31 +2299-12-31 2021-09-22 -------addYears--------- -1926-01-01 -1926-01-01 -2283-11-11 -2283-11-11 +1901-01-01 +1901-01-01 +2299-12-31 +2299-12-31 2022-06-22 -------subtractSeconds--------- -1925-01-01 00:00:00.000 -1925-01-01 00:00:00.000 -2282-12-30 23:00:00.000 -2283-11-10 23:00:00.000 +1900-01-01 
00:00:00.000 +1900-01-01 00:00:00.000 +2299-12-14 23:00:00.000 +2299-12-30 23:00:00.000 2021-06-21 23:00:00.000 -------subtractMinutes--------- -1925-01-01 00:00:00.000 -1925-01-01 00:00:00.000 -2282-12-30 23:00:00.000 -2283-11-10 23:00:00.000 +1900-01-01 00:00:00.000 +1900-01-01 00:00:00.000 +2299-12-14 23:00:00.000 +2299-12-30 23:00:00.000 2021-06-21 23:00:00.000 -------subtractHours--------- -1925-01-01 00:00:00.000 -1925-01-01 00:00:00.000 -2282-12-30 23:00:00.000 -2283-11-10 23:00:00.000 +1900-01-01 00:00:00.000 +1900-01-01 00:00:00.000 +2299-12-14 23:00:00.000 +2299-12-30 23:00:00.000 2021-06-21 23:00:00.000 -------subtractDays--------- -2283-11-05 -2283-11-05 -2282-12-24 -2283-11-04 +2299-12-31 +2299-12-31 +2299-12-08 +2299-12-24 2021-06-15 -------subtractWeeks--------- -2283-11-05 -2283-11-05 -2282-12-24 -2283-11-04 +2299-12-31 +2299-12-31 +2299-12-08 +2299-12-24 2021-06-15 -------subtractMonths--------- -1925-01-01 -1925-01-01 -2282-11-30 -2283-10-11 +1900-01-01 +1900-01-01 +2299-11-15 +2299-11-30 2021-05-22 -------subtractQuarters--------- -1925-01-01 -1925-01-01 -2282-09-30 -2283-08-11 +1900-01-01 +1900-01-01 +2299-09-15 +2299-09-30 2021-03-22 -------subtractYears--------- -1925-01-01 -1925-01-01 -2281-12-31 -2282-11-11 +1900-01-01 +1900-01-01 +2298-12-15 +2298-12-31 2020-06-22 -------toDate32--------- -1925-01-01 2000-01-01 -1925-01-01 1925-01-01 -1925-01-01 \N -1925-01-01 +1900-01-01 2000-01-01 +1900-01-01 1900-01-01 +1900-01-01 \N +1900-01-01 \N -1925-01-01 +1900-01-01 1969-12-31 1970-01-01 2149-06-06 2149-06-07 -2283-11-11 +2299-12-31 diff --git a/tests/queries/0_stateless/01921_datatype_date32.sql b/tests/queries/0_stateless/01921_datatype_date32.sql index ef6e3e5ee89..8b65f82825f 100644 --- a/tests/queries/0_stateless/01921_datatype_date32.sql +++ b/tests/queries/0_stateless/01921_datatype_date32.sql @@ -1,7 +1,7 @@ drop table if exists t1; create table t1(x1 Date32) engine Memory; -insert into t1 values ('1925-01-01'),('1924-01-01'),('2282-12-31'),('2283-12-31'),('2021-06-22'); +insert into t1 values ('1900-01-01'),('1899-01-01'),('2299-12-15'),('2300-12-31'),('2021-06-22'); select x1 from t1; select '-------toYear---------'; @@ -113,20 +113,19 @@ select subtractQuarters(x1, 1) from t1; select '-------subtractYears---------'; select subtractYears(x1, 1) from t1; select '-------toDate32---------'; -select toDate32('1925-01-01'), toDate32(toDate('2000-01-01')); -select toDate32OrZero('1924-01-01'), toDate32OrNull('1924-01-01'); +select toDate32('1900-01-01'), toDate32(toDate('2000-01-01')); +select toDate32OrZero('1899-01-01'), toDate32OrNull('1899-01-01'); select toDate32OrZero(''), toDate32OrNull(''); select (select toDate32OrZero('')); select (select toDate32OrNull('')); SELECT toString(T.d) dateStr FROM ( - SELECT '1925-01-01'::Date32 d + SELECT '1900-01-01'::Date32 d UNION ALL SELECT '1969-12-31'::Date32 UNION ALL SELECT '1970-01-01'::Date32 UNION ALL SELECT '2149-06-06'::Date32 UNION ALL SELECT '2149-06-07'::Date32 - UNION ALL SELECT '2283-11-11'::Date32 + UNION ALL SELECT '2299-12-31'::Date32 ) AS T ORDER BY T.d - From 77c66666b323d6aff0c24f3cf288d457f9654953 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 18 Jul 2022 20:21:21 +0000 Subject: [PATCH 018/164] Track mmap. 
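The idea in this change (reverted again by the next commit in the series): interpose mmap, munmap and mremap in glibc-compatibility, keep a process-wide counter of page-aligned mapped bytes, and expose it through getTotalMemoryMappedBytes() so AsynchronousMetrics can report it as "MemoryMapped". Below is a minimal standalone sketch of the same accounting; the helper names are illustrative only and not taken from the patch.

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Bytes currently mapped by this process, rounded up to whole pages. */
    static atomic_int_fast64_t mapped_bytes_total;

    static size_t align_to_page_size(size_t size)
    {
        const size_t page_size_mask = 4096 - 1; /* the exact page size is not important here */
        return (size + page_size_mask) & ~page_size_mask;
    }

    /* Called after a successful mmap() (or the grow side of mremap()). */
    static void account_map(size_t length)
    {
        atomic_fetch_add_explicit(&mapped_bytes_total, align_to_page_size(length), memory_order_relaxed);
    }

    /* Called after a successful munmap() (or the shrink side of mremap()). */
    static void account_unmap(size_t length)
    {
        atomic_fetch_sub_explicit(&mapped_bytes_total, align_to_page_size(length), memory_order_relaxed);
    }

    int main(void)
    {
        account_map(5000);   /* accounted as 8192: the length is rounded up to whole pages */
        account_unmap(4096);
        printf("%lld\n", (long long)atomic_load_explicit(&mapped_bytes_total, memory_order_relaxed)); /* prints 4096 */
        return 0;
    }

The relaxed atomics mirror the patch: the counter is a cheap approximation that is read periodically by the metrics thread, so no ordering stronger than memory_order_relaxed is needed.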
--- .../glibc-compatibility/glibc-compatibility.c | 70 ++++++++++++++++++- src/Interpreters/AsynchronousMetrics.cpp | 3 + 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/base/glibc-compatibility/glibc-compatibility.c b/base/glibc-compatibility/glibc-compatibility.c index e3f62b7948a..6885051ca28 100644 --- a/base/glibc-compatibility/glibc-compatibility.c +++ b/base/glibc-compatibility/glibc-compatibility.c @@ -12,6 +12,8 @@ extern "C" { #include #include #include +#include +#include long int syscall(long int __sysno, ...) __THROW; @@ -100,7 +102,7 @@ int __dprintf_chk (int d, int unused, const char *format, ...) return ret; } -size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); +//size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); size_t __fread_chk(void *ptr, size_t unused, size_t size, size_t nmemb, void *stream) { @@ -133,6 +135,72 @@ int __open_2(const char *path, int oflag) return open(path, oflag); } +#include + +atomic_int_fast64_t mmap_allocated_bytes_total = 0; + +static size_t alignToPageSize(size_t size) +{ + /// We don't need to be precise here. + static size_t page_size_mask = 4096 - 1; + return (size + page_size_mask) & (~page_size_mask); +} + +void * __mmap(void * addr, size_t length, int prot, int flags, int fd, off_t offset); + +void * mmap(void * addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void * res = __mmap(addr, length, prot, flags, fd, offset); + //char is_executable_file = (flags & MAP_ANONYMOUS) == 0 && (flags & PROT_EXEC) != 0; + if (res != (void *) -1) + { + // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; + // fprintf( stderr, "++++++ %zu\n", prev); + atomic_fetch_add_explicit(&mmap_allocated_bytes_total, alignToPageSize(length), memory_order_relaxed); + } + return res; +} + +int __munmap(void * addr, size_t length); + +int munmap(void * addr, size_t length) +{ + int res = __munmap(addr, length); + if (res == 0) + { + + // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; + atomic_fetch_sub_explicit(&mmap_allocated_bytes_total, alignToPageSize(length), memory_order_relaxed); + // fprintf( stderr, "------ %zu\n", prev); + } + return res; +} + +void * mremap(void *old_addr, size_t old_len, size_t new_len, int flags, ...) 
+{ + va_list ap; + void *new_addr; + + va_start(ap, flags); + new_addr = va_arg(ap, void *); + va_end(ap); + + void * res = (void *)syscall(SYS_mremap, old_addr, old_len, new_len, flags, new_addr); + + if (res != (void *) -1) + { + // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; + // fprintf( stderr, "========= %zu\n", prev); + atomic_fetch_sub_explicit(&mmap_allocated_bytes_total, alignToPageSize(old_len), memory_order_relaxed); + atomic_fetch_add_explicit(&mmap_allocated_bytes_total, alignToPageSize(new_len), memory_order_relaxed); + } + return res; +} + +extern int_fast64_t getTotalMemoryMappedBytes() +{ + return atomic_load_explicit(&mmap_allocated_bytes_total, memory_order_relaxed); +} #include diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 9275c1d6840..911e5271217 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -35,6 +35,7 @@ namespace CurrentMetrics extern const Metric MemoryTracking; } +extern "C" int_fast64_t getTotalMemoryMappedBytes(); namespace DB { @@ -651,6 +652,8 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } + new_values["MemoryMapped"] = getTotalMemoryMappedBytes(); + /// Process process memory usage according to OS #if defined(OS_LINUX) || defined(OS_FREEBSD) { From 7ca666381205de39c5a1e74b2e390ac76d207d3f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 18 Jul 2022 20:21:37 +0000 Subject: [PATCH 019/164] Revert "Track mmap." This reverts commit 77c66666b323d6aff0c24f3cf288d457f9654953. --- .../glibc-compatibility/glibc-compatibility.c | 70 +------------------ src/Interpreters/AsynchronousMetrics.cpp | 3 - 2 files changed, 1 insertion(+), 72 deletions(-) diff --git a/base/glibc-compatibility/glibc-compatibility.c b/base/glibc-compatibility/glibc-compatibility.c index 6885051ca28..e3f62b7948a 100644 --- a/base/glibc-compatibility/glibc-compatibility.c +++ b/base/glibc-compatibility/glibc-compatibility.c @@ -12,8 +12,6 @@ extern "C" { #include #include #include -#include -#include long int syscall(long int __sysno, ...) __THROW; @@ -102,7 +100,7 @@ int __dprintf_chk (int d, int unused, const char *format, ...) return ret; } -//size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); +size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); size_t __fread_chk(void *ptr, size_t unused, size_t size, size_t nmemb, void *stream) { @@ -135,72 +133,6 @@ int __open_2(const char *path, int oflag) return open(path, oflag); } -#include - -atomic_int_fast64_t mmap_allocated_bytes_total = 0; - -static size_t alignToPageSize(size_t size) -{ - /// We don't need to be precise here. 
- static size_t page_size_mask = 4096 - 1; - return (size + page_size_mask) & (~page_size_mask); -} - -void * __mmap(void * addr, size_t length, int prot, int flags, int fd, off_t offset); - -void * mmap(void * addr, size_t length, int prot, int flags, int fd, off_t offset) -{ - void * res = __mmap(addr, length, prot, flags, fd, offset); - //char is_executable_file = (flags & MAP_ANONYMOUS) == 0 && (flags & PROT_EXEC) != 0; - if (res != (void *) -1) - { - // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; - // fprintf( stderr, "++++++ %zu\n", prev); - atomic_fetch_add_explicit(&mmap_allocated_bytes_total, alignToPageSize(length), memory_order_relaxed); - } - return res; -} - -int __munmap(void * addr, size_t length); - -int munmap(void * addr, size_t length) -{ - int res = __munmap(addr, length); - if (res == 0) - { - - // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; - atomic_fetch_sub_explicit(&mmap_allocated_bytes_total, alignToPageSize(length), memory_order_relaxed); - // fprintf( stderr, "------ %zu\n", prev); - } - return res; -} - -void * mremap(void *old_addr, size_t old_len, size_t new_len, int flags, ...) -{ - va_list ap; - void *new_addr; - - va_start(ap, flags); - new_addr = va_arg(ap, void *); - va_end(ap); - - void * res = (void *)syscall(SYS_mremap, old_addr, old_len, new_len, flags, new_addr); - - if (res != (void *) -1) - { - // size_t prev = atomic_load(&mmap_allocated_bytes_total) & 4095; - // fprintf( stderr, "========= %zu\n", prev); - atomic_fetch_sub_explicit(&mmap_allocated_bytes_total, alignToPageSize(old_len), memory_order_relaxed); - atomic_fetch_add_explicit(&mmap_allocated_bytes_total, alignToPageSize(new_len), memory_order_relaxed); - } - return res; -} - -extern int_fast64_t getTotalMemoryMappedBytes() -{ - return atomic_load_explicit(&mmap_allocated_bytes_total, memory_order_relaxed); -} #include diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 911e5271217..9275c1d6840 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -35,7 +35,6 @@ namespace CurrentMetrics extern const Metric MemoryTracking; } -extern "C" int_fast64_t getTotalMemoryMappedBytes(); namespace DB { @@ -652,8 +651,6 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } - new_values["MemoryMapped"] = getTotalMemoryMappedBytes(); - /// Process process memory usage according to OS #if defined(OS_LINUX) || defined(OS_FREEBSD) { From 87e5b31598b44360fff26c2b900f996461426288 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 18 Jul 2022 21:16:16 +0000 Subject: [PATCH 020/164] Do not include memory buffered by allocator into drift. --- src/Common/MemoryTracker.cpp | 13 +++-- src/Interpreters/AsynchronousMetrics.cpp | 68 +++++++++++++----------- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index ba097568477..23ae758ccdd 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -136,6 +136,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT { /// For global memory tracker always update memory usage. 
amount.fetch_add(size, std::memory_order_relaxed); + rss.fetch_add(size, std::memory_order_relaxed); auto metric_loaded = metric.load(std::memory_order_relaxed); if (metric_loaded != CurrentMetrics::end()) @@ -207,17 +208,21 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT allocation_traced = true; } + Int64 amount_to_check = will_be; bool used_rss_counter = false; + /// For Global memory tracker, additionally check RSS. + /// It is needed to avoid possible OOM. + /// We can't track all memory allocations from external libraries (yet). if (level == VariableContext::Global) { - if (Int64 current_rss = rss.load(std::memory_order_relaxed); unlikely(current_rss + size > will_be)) + if (Int64 current_rss = size + rss.fetch_add(size, std::memory_order_relaxed); unlikely(current_rss > will_be)) { used_rss_counter = true; - will_be = current_rss + size; + amount_to_check = current_rss; } } - if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) + if (unlikely(current_hard_limit && amount_to_check > current_hard_limit) && memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) { OvercommitResult overcommit_result = OvercommitResult::NONE; if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed); overcommit_tracker_ptr != nullptr && query_tracker != nullptr) @@ -235,7 +240,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT description ? " " : "", description ? description : "", used_rss_counter ? "(RSS) " : "", - formatReadableSizeWithBinarySuffix(will_be), + formatReadableSizeWithBinarySuffix(amount_to_check), size, formatReadableSizeWithBinarySuffix(current_hard_limit), toDescription(overcommit_result)); diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 9275c1d6840..2dda8f5b39c 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -387,7 +387,7 @@ uint64_t updateJemallocEpoch() } template -static void saveJemallocMetricImpl(AsynchronousMetricValues & values, +static Value saveJemallocMetricImpl(AsynchronousMetricValues & values, const std::string & jemalloc_full_name, const std::string & clickhouse_full_name) { @@ -395,22 +395,23 @@ static void saveJemallocMetricImpl(AsynchronousMetricValues & values, size_t size = sizeof(value); mallctl(jemalloc_full_name.c_str(), &value, &size, nullptr, 0); values[clickhouse_full_name] = value; + return value; } template -static void saveJemallocMetric(AsynchronousMetricValues & values, +static Value saveJemallocMetric(AsynchronousMetricValues & values, const std::string & metric_name) { - saveJemallocMetricImpl(values, + return saveJemallocMetricImpl(values, fmt::format("stats.{}", metric_name), fmt::format("jemalloc.{}", metric_name)); } template -static void saveAllArenasMetric(AsynchronousMetricValues & values, +static Value saveAllArenasMetric(AsynchronousMetricValues & values, const std::string & metric_name) { - saveJemallocMetricImpl(values, + return saveJemallocMetricImpl(values, fmt::format("stats.arenas.{}.{}", MALLCTL_ARENAS_ALL, metric_name), fmt::format("jemalloc.arenas.all.{}", metric_name)); } @@ -651,6 +652,31 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } +#if USE_JEMALLOC + // 'epoch' is a special mallctl -- it updates the statistics. Without it, all + // the following calls will return stale values. 
It increments and returns + // the current epoch number, which might be useful to log as a sanity check. + auto epoch = updateJemallocEpoch(); + new_values["jemalloc.epoch"] = epoch; + + // Collect the statistics themselves. + size_t je_malloc_allocated = saveJemallocMetric(new_values, "allocated"); + saveJemallocMetric(new_values, "active"); + saveJemallocMetric(new_values, "metadata"); + saveJemallocMetric(new_values, "metadata_thp"); + saveJemallocMetric(new_values, "resident"); + size_t je_malloc_mapped = saveJemallocMetric(new_values, "mapped"); + saveJemallocMetric(new_values, "retained"); + saveJemallocMetric(new_values, "background_thread.num_threads"); + saveJemallocMetric(new_values, "background_thread.num_runs"); + saveJemallocMetric(new_values, "background_thread.run_intervals"); + saveAllArenasMetric(new_values, "pactive"); + saveAllArenasMetric(new_values, "pdirty"); + saveAllArenasMetric(new_values, "pmuzzy"); + saveAllArenasMetric(new_values, "dirty_purged"); + saveAllArenasMetric(new_values, "muzzy_purged"); +#endif + /// Process process memory usage according to OS #if defined(OS_LINUX) || defined(OS_FREEBSD) { @@ -672,6 +698,13 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti Int64 peak = total_memory_tracker.getPeak(); Int64 rss = data.resident; +#if USE_JEMALLOC + /// This is a memory which is kept by allocator. + /// Remove it from RSS to decrease memory drift. + rss -= je_malloc_mapped - je_malloc_allocated; +#endif + /// In theory, the difference between RSS and tracked memory should be caused by + /// external libraries which allocation we can't track. Int64 rss_drift = rss - amount; Int64 difference = rss_drift - last_logged_rss_drift; @@ -1470,31 +1503,6 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } -#if USE_JEMALLOC - // 'epoch' is a special mallctl -- it updates the statistics. Without it, all - // the following calls will return stale values. It increments and returns - // the current epoch number, which might be useful to log as a sanity check. - auto epoch = updateJemallocEpoch(); - new_values["jemalloc.epoch"] = epoch; - - // Collect the statistics themselves. - saveJemallocMetric(new_values, "allocated"); - saveJemallocMetric(new_values, "active"); - saveJemallocMetric(new_values, "metadata"); - saveJemallocMetric(new_values, "metadata_thp"); - saveJemallocMetric(new_values, "resident"); - saveJemallocMetric(new_values, "mapped"); - saveJemallocMetric(new_values, "retained"); - saveJemallocMetric(new_values, "background_thread.num_threads"); - saveJemallocMetric(new_values, "background_thread.num_runs"); - saveJemallocMetric(new_values, "background_thread.run_intervals"); - saveAllArenasMetric(new_values, "pactive"); - saveAllArenasMetric(new_values, "pdirty"); - saveAllArenasMetric(new_values, "pmuzzy"); - saveAllArenasMetric(new_values, "dirty_purged"); - saveAllArenasMetric(new_values, "muzzy_purged"); -#endif - /// Add more metrics as you wish. 
new_values["AsynchronousMetricsCalculationTimeSpent"] = watch.elapsedSeconds(); From 5a3cb0771152897a5267f92bed155f9c1589e776 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 18 Jul 2022 21:24:41 +0000 Subject: [PATCH 021/164] Fix rss --- src/Common/MemoryTracker.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index 1e370d9d400..0d7d7003ae2 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -301,6 +301,7 @@ void MemoryTracker::free(Int64 size) { /// For global memory tracker always update memory usage. amount.fetch_sub(size, std::memory_order_relaxed); + rss.fetch_sub(size, std::memory_order_relaxed); auto metric_loaded = metric.load(std::memory_order_relaxed); if (metric_loaded != CurrentMetrics::end()) From fc6982f9e28ade1b7077be4381b5079f46244c19 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 19 Jul 2022 10:10:02 +0000 Subject: [PATCH 022/164] Fixing test. --- src/Common/MemoryTracker.cpp | 5 +++-- .../test.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index 0d7d7003ae2..ef5e0c45373 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -295,14 +295,15 @@ bool MemoryTracker::updatePeak(Int64 will_be, bool log_memory_usage) void MemoryTracker::free(Int64 size) { + if (level == VariableContext::Global) + rss.fetch_sub(size, std::memory_order_relaxed); + if (MemoryTrackerBlockerInThread::isBlocked(level)) { if (level == VariableContext::Global) { /// For global memory tracker always update memory usage. amount.fetch_sub(size, std::memory_order_relaxed); - rss.fetch_sub(size, std::memory_order_relaxed); - auto metric_loaded = metric.load(std::memory_order_relaxed); if (metric_loaded != CurrentMetrics::end()) CurrentMetrics::add(metric_loaded, size); diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py index c95bbfda708..eba3aeff303 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py @@ -42,7 +42,8 @@ def test_memory_tracking_total(): "bash", "-c", "clickhouse local -q \"SELECT arrayStringConcat(arrayMap(x->toString(cityHash64(x)), range(1000)), ' ') from numbers(10000)\" > data.json", - ] + ], + user="root" ) for it in range(0, 20): From 5f9c293963de0a217c31abacdf143e4851a0149e Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 19 Jul 2022 17:29:08 +0000 Subject: [PATCH 023/164] Fix addDays() and addWeeks() in upper and lower limits of Date and Date32 --- src/Common/DateLUTImpl.h | 4 ---- .../FunctionDateOrDateTimeAddInterval.h | 18 ++++++++++++++---- .../01921_datatype_date32.reference | 8 ++++---- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index a5a2b491a61..d1b226e18b1 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -19,10 +19,7 @@ #define DATE_LUT_MAX (0xFFFFFFFFU - 86400) #define DATE_LUT_MAX_DAY_NUM 0xFFFF -#define DAYNUM_OFFSET_EPOCH 25567 - /// Max int value of Date32, DATE LUT cache size minus daynum_offset_epoch -// #define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - (Time)DAYNUM_OFFSET_EPOCH) #define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - 25567) /// A constant to add to time_t so every supported time point becomes 
non-negative and still has the same remainder of division by 3600. @@ -195,7 +192,6 @@ private: /// Offset to epoch in days (ExtendedDayNum) of the first day in LUT. /// "epoch" is the Unix Epoch (starts at unix timestamp zero) - // static constexpr UInt32 daynum_offset_epoch = DAYNUM_OFFSET_EPOCH; static constexpr UInt32 daynum_offset_epoch = 25567; static_assert(daynum_offset_epoch == (1970 - DATE_LUT_MIN_YEAR) * 365 + (1970 - DATE_LUT_MIN_YEAR / 4 * 4) / 4); diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index fbfc9e9bc1f..10408093240 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -288,12 +288,17 @@ struct AddDaysImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - return d + delta; + Int64 r = d + delta; + if (r < 0) + return 0; + if (r > 65535) + return 65535; + return static_cast(r); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - return d + delta; + return std::max(static_cast(d + delta), -static_cast(DateLUT::instance().getDayNumOffsetEpoch())); } }; @@ -322,12 +327,17 @@ struct AddWeeksImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - return d + delta * 7; + Int64 r = d + delta * 7; + if (r < 0) + return 0; + if (r > 65535) + return 65535; + return static_cast(r); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - return d + delta * 7; + return std::max(static_cast(d + delta * 7), -static_cast(DateLUT::instance().getDayNumOffsetEpoch())); } }; diff --git a/tests/queries/0_stateless/01921_datatype_date32.reference b/tests/queries/0_stateless/01921_datatype_date32.reference index acb0cc4ca59..8cc9cc2886f 100644 --- a/tests/queries/0_stateless/01921_datatype_date32.reference +++ b/tests/queries/0_stateless/01921_datatype_date32.reference @@ -248,14 +248,14 @@ 2299-12-30 23:00:00.000 2021-06-21 23:00:00.000 -------subtractDays--------- -2299-12-31 -2299-12-31 +1900-01-01 +1900-01-01 2299-12-08 2299-12-24 2021-06-15 -------subtractWeeks--------- -2299-12-31 -2299-12-31 +1900-01-01 +1900-01-01 2299-12-08 2299-12-24 2021-06-15 From afab965d7bf8691bd0f79e96ae138376b310d0d4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 19 Jul 2022 12:38:47 -0700 Subject: [PATCH 024/164] style check --- src/Daemon/BaseDaemon.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index f9b627aaf79..74521a5fd4a 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -5,7 +5,6 @@ #include #include -#include #include #include #include From 877854b14332195fab91b778c667ee020fec19bd Mon Sep 17 00:00:00 2001 From: root Date: Wed, 20 Jul 2022 05:45:04 -0700 Subject: [PATCH 025/164] resolved build error saying - error: method 'formatExtendedJSON' can be made static --- src/Loggers/OwnJSONPatternFormatter.cpp | 2 +- src/Loggers/OwnJSONPatternFormatter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp index e9132ef0f0b..825cb1e9806 100644 --- a/src/Loggers/OwnJSONPatternFormatter.cpp +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -13,7 +13,7 @@ OwnJSONPatternFormatter::OwnJSONPatternFormatter() : Poco::PatternFormatter("") } -void 
OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) const +void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) { DB::WriteBufferFromString wb(text); diff --git a/src/Loggers/OwnJSONPatternFormatter.h b/src/Loggers/OwnJSONPatternFormatter.h index 54e49a6391d..0523869aebb 100644 --- a/src/Loggers/OwnJSONPatternFormatter.h +++ b/src/Loggers/OwnJSONPatternFormatter.h @@ -27,5 +27,5 @@ public: OwnJSONPatternFormatter(); void format(const Poco::Message & msg, std::string & text) override; - void formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) const; + static void formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text); }; From 9cbf1fd7fdd132849f6bd3c486f98e8d9da60fbb Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 20 Jul 2022 14:17:25 +0000 Subject: [PATCH 026/164] Fix tests --- ...01825_type_json_schema_inference.reference | 6 +- .../02149_schema_inference.reference | 90 +++++++++---------- .../02240_tskv_schema_inference_bug.reference | 4 +- .../02245_s3_schema_desc.reference | 24 ++--- ...csv_best_effort_schema_inference.reference | 14 +-- ...2305_schema_inference_with_globs.reference | 4 +- ...om_json_strings_schema_inference.reference | 16 ++-- 7 files changed, 79 insertions(+), 79 deletions(-) diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.reference b/tests/queries/0_stateless/01825_type_json_schema_inference.reference index a2089ea3366..22dfdee96a7 100644 --- a/tests/queries/0_stateless/01825_type_json_schema_inference.reference +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.reference @@ -3,6 +3,6 @@ Tuple(k1 Nullable(Int8), k2 Tuple(k3 Nullable(String), k4 Nested(k5 Nullable(Int8), k6 Nullable(Int8))), some Nullable(Int8)) {"id":"1","obj":"aaa","s":"foo"} {"id":"2","obj":"bbb","s":"bar"} -{"map":{"k1":1,"k2":2},"obj":{"k1":1,"k2.k3":2},"map_type":"Map(String, Nullable(Float64))","obj_type":"Object(Nullable('json'))"} -{"obj":{"k1":1,"k2":2},"map":{"k1":"1","k2":"2"}} -Tuple(k1 Float64, k2 Float64) +{"map":{"k1":"1","k2":"2"},"obj":{"k1":1,"k2.k3":2},"map_type":"Map(String, Nullable(Int64))","obj_type":"Object(Nullable('json'))"} +{"obj":{"k1":"1","k2":"2"},"map":{"k1":"1","k2":"2"}} +Tuple(k1 Int64, k2 Int64) diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index 2d7dd5caca7..954e1813157 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -1,22 +1,22 @@ TSV -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 Array(Nullable(Float64)) -c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) 42 Some string [1,2,3,4] (1,2,3) 42 abcd [] (4,5,6) TSVWithNames -number Nullable(Float64) +number Nullable(Int64) string Nullable(String) -array Array(Nullable(Float64)) -tuple Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +array Array(Nullable(Int64)) +tuple Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) 42 Some string [1,2,3,4] (1,2,3) 42 abcd [] (4,5,6) CSV -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) -c4 Array(Nullable(Float64)) +c4 Array(Nullable(Int64)) \N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] 42 
\N [([1,2.3],'String'),([3],'abcd')] [4,5,6] c1 Nullable(String) @@ -28,54 +28,54 @@ c2 Nullable(String) \N [NULL, NULL] \N [] CSVWithNames -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) -d Array(Nullable(Float64)) +d Array(Nullable(Int64)) \N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] 42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] JSONCompactEachRow c1 Nullable(Float64) -c2 Array(Tuple(Nullable(Float64), Nullable(String))) -c3 Map(String, Nullable(Float64)) +c2 Array(Tuple(Nullable(Int64), Nullable(String))) +c3 Map(String, Nullable(Int64)) c4 Nullable(Bool) 42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} true -c1 Nullable(Float64) -c2 Array(Tuple(Nullable(Float64), Nullable(String))) -c3 Map(String, Nullable(Float64)) +c1 Nullable(Int64) +c2 Array(Tuple(Nullable(Int64), Nullable(String))) +c3 Map(String, Nullable(Int64)) c4 Nullable(Bool) \N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N 32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} true JSONCompactEachRowWithNames a Nullable(Float64) -b Array(Tuple(Nullable(Float64), Nullable(String))) -c Map(String, Nullable(Float64)) +b Array(Tuple(Nullable(Int64), Nullable(String))) +c Map(String, Nullable(Int64)) d Nullable(Bool) 42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} true JSONEachRow a Nullable(Float64) -b Array(Tuple(Nullable(Float64), Nullable(String))) -c Map(String, Nullable(Float64)) +b Array(Tuple(Nullable(Int64), Nullable(String))) +c Map(String, Nullable(Int64)) d Nullable(Bool) 42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} true -a Nullable(Float64) -b Array(Tuple(Nullable(Float64), Nullable(String))) -c Map(String, Nullable(Float64)) +a Nullable(Int64) +b Array(Tuple(Nullable(Int64), Nullable(String))) +c Map(String, Nullable(Int64)) d Nullable(Bool) \N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N 32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} true -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Nullable(Float64)) +c Array(Nullable(Int64)) 1 s1 [] 2 \N [2] \N \N [] \N \N [] \N \N [3] TSKV -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Nullable(Float64)) +c Array(Nullable(Int64)) 1 s1 [] 2 } [2] \N \N [] @@ -84,77 +84,77 @@ c Array(Nullable(Float64)) Values c1 Nullable(Float64) c2 Nullable(String) -c3 Array(Nullable(Float64)) -c4 Tuple(Nullable(Float64), Nullable(String)) -c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(String)) +c5 Tuple(Array(Nullable(Int64)), Array(Tuple(Nullable(Int64), Nullable(String)))) 42.42 Some string [1,2,3] (1,'2') ([1,2],[(3,'4'),(5,'6')]) c1 Nullable(Float64) c2 Nullable(String) -c3 Array(Nullable(Float64)) -c4 Tuple(Nullable(Float64), Nullable(Float64)) -c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(Int64)) +c5 Tuple(Array(Nullable(Int64)), Array(Tuple(Nullable(Int64), Nullable(String)))) 42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) \N Some string [10] (1,2) ([],[]) Regexp -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) c3 Nullable(String) 42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] 2 Some string 2 [([4, 5, 6], String 2), ([], String 2)] 312 Some string 3 [([1, 2, 3], String 2), ([], String 2)] -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 
Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] 3 Some string 2 [([3,5,1],'String 2'),([],'String 2')] 244 Some string 3 [([],'String 3'),([],'String 3')] -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] 2 Some string 2 [([],'String 2'),([],'String 2')] 43 Some string 3 [([1,5,3],'String 3'),([],'String 3')] -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 52 Some string 2 [([],'String 2'),([1],'String 2')] 24 Some string 3 [([1,2,3],'String 3'),([1],'String 3')] CustomSeparated c1 Nullable(Float64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] c1 Nullable(Float64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] c1 Nullable(Float64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] Template column_1 Nullable(Float64) column_2 Nullable(String) -column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +column_3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] column_1 Nullable(Float64) column_2 Nullable(String) -column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +column_3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] column_1 Nullable(Float64) column_2 Nullable(String) -column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +column_3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] 42 Some string 2 [([],'String 2'),([],'String 2')] \N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] diff --git a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference index d0ced74f8f6..0f8ac77ff74 100644 --- a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference +++ b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference @@ -1,6 +1,6 @@ -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Nullable(Float64)) +c Array(Nullable(Int64)) 1 s1 [] 2 } [2] \N \N [] diff --git 
a/tests/queries/0_stateless/02245_s3_schema_desc.reference b/tests/queries/0_stateless/02245_s3_schema_desc.reference index e039680d933..d840a365310 100644 --- a/tests/queries/0_stateless/02245_s3_schema_desc.reference +++ b/tests/queries/0_stateless/02245_s3_schema_desc.reference @@ -1,21 +1,21 @@ -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) c1 UInt64 c2 UInt64 c3 UInt64 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) c1 UInt64 c2 UInt64 c3 UInt64 diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference index c245f13fdbe..1c60e40942c 100644 --- a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -1,8 +1,8 @@ TSV -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 Array(Nullable(Float64)) -c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) 42 Some string [1,2,3,4] (1,2,3) 42 abcd [] (4,5,6) c1 Nullable(String) @@ -70,8 +70,8 @@ c1 Nullable(String) CSV c1 Nullable(String) c2 Nullable(String) -c3 Array(Nullable(Float64)) -c4 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +c3 Array(Nullable(Int64)) +c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) 42 Some string [1,2,3,4] [(1,2,3)] 42\\ abcd [] [(4,5,6)] c1 Nullable(String) @@ -101,7 +101,7 @@ c1 Nullable(String) (1, 2, 3) c1 Nullable(String) 123.123 -c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] -c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] diff --git a/tests/queries/0_stateless/02305_schema_inference_with_globs.reference b/tests/queries/0_stateless/02305_schema_inference_with_globs.reference index 9df5d2a264c..defa2133823 100644 --- a/tests/queries/0_stateless/02305_schema_inference_with_globs.reference +++ b/tests/queries/0_stateless/02305_schema_inference_with_globs.reference @@ -1,5 +1,5 @@ 2 4 6 -x Nullable(String) -x Nullable(String) +x Nullable(Int64) +x Nullable(Int64) diff --git a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference index 2972dd92756..6da939d7839 100644 --- a/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference +++ b/tests/queries/0_stateless/02326_numbers_from_json_strings_schema_inference.reference @@ -1,11 +1,11 @@ -x Nullable(Float64) +x Nullable(Int64) x Array(Nullable(Float64)) -x Map(String, Nullable(Float64)) -x Map(String, Array(Nullable(Float64))) -x Nullable(Float64) -x Array(Nullable(Float64)) -x Map(String, Nullable(Float64)) -x Map(String, Array(Nullable(Float64))) +x Map(String, Nullable(Int64)) +x Map(String, 
Array(Nullable(Int64))) +x Nullable(Int64) +x Array(Nullable(Int64)) +x Map(String, Nullable(Int64)) +x Map(String, Array(Nullable(Int64))) x Array(Nullable(String)) x Map(String, Nullable(String)) x Map(String, Array(Nullable(String))) @@ -13,5 +13,5 @@ x Nullable(String) x Array(Nullable(String)) x Map(String, Nullable(String)) x Map(String, Array(Nullable(String))) -x Tuple(Nullable(Float64), Nullable(String)) +x Tuple(Nullable(Int64), Nullable(String)) x Object(Nullable(\'json\')) From e3192cf753794a2520c4461cb7c0cf7d6b719aa8 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 20 Jul 2022 15:19:02 +0000 Subject: [PATCH 027/164] Correct docs to reflect new range 1900..2299 for Date32 and DateTime64; Cleanup code --- docs/en/sql-reference/data-types/date32.md | 4 ++-- docs/en/sql-reference/data-types/datetime64.md | 2 +- .../functions/date-time-functions.md | 6 +++--- .../functions/type-conversion-functions.md | 16 ++++++++-------- docs/ru/sql-reference/data-types/date32.md | 4 ++-- docs/ru/sql-reference/data-types/datetime64.md | 2 +- .../functions/date-time-functions.md | 4 ++-- .../functions/type-conversion-functions.md | 18 +++++++++--------- docs/zh/sql-reference/data-types/datetime64.md | 2 +- .../functions/date-time-functions.md | 6 +++--- src/Common/DateLUTImpl.h | 14 +++++++++----- .../FunctionDateOrDateTimeAddInterval.h | 14 ++------------ src/Functions/FunctionsConversion.h | 1 - 13 files changed, 43 insertions(+), 50 deletions(-) diff --git a/docs/en/sql-reference/data-types/date32.md b/docs/en/sql-reference/data-types/date32.md index e1d6e2363e8..b5a82128e69 100644 --- a/docs/en/sql-reference/data-types/date32.md +++ b/docs/en/sql-reference/data-types/date32.md @@ -5,7 +5,7 @@ sidebar_label: Date32 # Date32 -A date. Supports the date range same with [Datetime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1925-01-01. Allows storing values till 2283-11-11. +A date. Supports the date range same with [Datetime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1900-01-01. Allows storing values till 2299-12-31. **Examples** @@ -36,5 +36,5 @@ SELECT * FROM new; - [toDate32](../../sql-reference/functions/type-conversion-functions.md#todate32) - [toDate32OrZero](../../sql-reference/functions/type-conversion-functions.md#todate32-or-zero) -- [toDate32OrNull](../../sql-reference/functions/type-conversion-functions.md#todate32-or-null) +- [toDate32OrNull](../../sql-reference/functions/type-conversion-functions.md#todate32-or-null) diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index ddc71e75e44..a5a520a978e 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -18,7 +18,7 @@ DateTime64(precision, [timezone]) Internally, stores data as a number of ‘ticks’ since epoch start (1970-01-01 00:00:00 UTC) as Int64. The tick resolution is determined by the precision parameter. Additionally, the `DateTime64` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime64` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01.000’). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. See details in [DateTime](../../sql-reference/data-types/datetime.md). 
-Supported range of values: \[1925-01-01 00:00:00, 2283-11-11 23:59:59.99999999\] (Note: The precision of the maximum value is 8). +Supported range of values: \[1900-01-01 00:00:00, 2299-12-31 23:59:59.99999999\] (Note: The precision of the maximum value is 8). ## Examples diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 621429fb02c..7a843e1c87b 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -266,8 +266,8 @@ Result: └────────────────┘ ``` -:::note -The return type `toStartOf*` functions described below is `Date` or `DateTime`. Though these functions can take `DateTime64` as an argument, passing them a `DateTime64` that is out of the normal range (years 1925 - 2283) will give an incorrect result. +:::note +The return type `toStartOf*` functions described below is `Date` or `DateTime`. Though these functions can take `DateTime64` as an argument, passing them a `DateTime64` that is out of the normal range (years 1900 - 2299) will give an incorrect result. ::: ## toStartOfYear @@ -291,7 +291,7 @@ Returns the date. Rounds down a date or date with time to the first day of the month. Returns the date. -:::note +:::note The behavior of parsing incorrect dates is implementation specific. ClickHouse may return zero date, throw an exception or do “natural” overflow. ::: diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3f4db831e3d..d0dc651958d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -218,23 +218,23 @@ SELECT toDate32('1955-01-01') AS value, toTypeName(value); 2. The value is outside the range: ``` sql -SELECT toDate32('1924-01-01') AS value, toTypeName(value); +SELECT toDate32('1899-01-01') AS value, toTypeName(value); ``` ``` text -┌──────value─┬─toTypeName(toDate32('1925-01-01'))─┐ -│ 1925-01-01 │ Date32 │ +┌──────value─┬─toTypeName(toDate32('1899-01-01'))─┐ +│ 1900-01-01 │ Date32 │ └────────────┴────────────────────────────────────┘ ``` 3. With `Date`-type argument: ``` sql -SELECT toDate32(toDate('1924-01-01')) AS value, toTypeName(value); +SELECT toDate32(toDate('1899-01-01')) AS value, toTypeName(value); ``` ``` text -┌──────value─┬─toTypeName(toDate32(toDate('1924-01-01')))─┐ +┌──────value─┬─toTypeName(toDate32(toDate('1899-01-01')))─┐ │ 1970-01-01 │ Date32 │ └────────────┴────────────────────────────────────────────┘ ``` @@ -248,14 +248,14 @@ The same as [toDate32](#todate32) but returns the min value of [Date32](../../sq Query: ``` sql -SELECT toDate32OrZero('1924-01-01'), toDate32OrZero(''); +SELECT toDate32OrZero('1899-01-01'), toDate32OrZero(''); ``` Result: ``` text -┌─toDate32OrZero('1924-01-01')─┬─toDate32OrZero('')─┐ -│ 1925-01-01 │ 1925-01-01 │ +┌─toDate32OrZero('1899-01-01')─┬─toDate32OrZero('')─┐ +│ 1900-01-01 │ 1900-01-01 │ └──────────────────────────────┴────────────────────┘ ``` diff --git a/docs/ru/sql-reference/data-types/date32.md b/docs/ru/sql-reference/data-types/date32.md index 31b2258b70b..1fc5ff6e5e2 100644 --- a/docs/ru/sql-reference/data-types/date32.md +++ b/docs/ru/sql-reference/data-types/date32.md @@ -5,7 +5,7 @@ sidebar_label: Date32 # Date32 {#data_type-datetime32} -Дата. Поддерживается такой же диапазон дат, как для типа [Datetime64](../../sql-reference/data-types/datetime64.md). 
Значение хранится в четырех байтах и соответствует числу дней с 1925-01-01 по 2283-11-11. +Дата. Поддерживается такой же диапазон дат, как для типа [Datetime64](../../sql-reference/data-types/datetime64.md). Значение хранится в четырех байтах и соответствует числу дней с 1900-01-01 по 2299-12-31. **Пример** @@ -36,5 +36,5 @@ SELECT * FROM new; - [toDate32](../../sql-reference/functions/type-conversion-functions.md#todate32) - [toDate32OrZero](../../sql-reference/functions/type-conversion-functions.md#todate32-or-zero) -- [toDate32OrNull](../../sql-reference/functions/type-conversion-functions.md#todate32-or-null) +- [toDate32OrNull](../../sql-reference/functions/type-conversion-functions.md#todate32-or-null) diff --git a/docs/ru/sql-reference/data-types/datetime64.md b/docs/ru/sql-reference/data-types/datetime64.md index 8428c5b7309..0473d8256e9 100644 --- a/docs/ru/sql-reference/data-types/datetime64.md +++ b/docs/ru/sql-reference/data-types/datetime64.md @@ -18,7 +18,7 @@ DateTime64(precision, [timezone]) Данные хранятся в виде количества ‘тиков’, прошедших с момента начала эпохи (1970-01-01 00:00:00 UTC), в Int64. Размер тика определяется параметром precision. Дополнительно, тип `DateTime64` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime64` в текстовом виде и как будут парситься значения заданные в виде строк (‘2020-01-01 05:00:01.000’). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. Подробнее см. [DateTime](datetime.md). -Диапазон значений: \[1925-01-01 00:00:00, 2283-11-11 23:59:59.99999999\] (Примечание: Точность максимального значения составляет 8). +Диапазон значений: \[1900-01-01 00:00:00, 2299-12-31 23:59:59.99999999\] (Примечание: Точность максимального значения составляет 8). ## Примеры {#examples} diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index da48cd940a7..242861af0d9 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -57,7 +57,7 @@ toTimezone(value, timezone) **Аргументы** - `value` — время или дата с временем. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). Этот аргумент является константой, потому что `toTimezone` изменяет часовой пояс столбца (часовой пояс является атрибутом типов `DateTime*`). +- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). Этот аргумент является константой, потому что `toTimezone` изменяет часовой пояс столбца (часовой пояс является атрибутом типов `DateTime*`). **Возвращаемое значение** @@ -267,7 +267,7 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp; ``` :::note "Attention" - `Date` или `DateTime` это возвращаемый тип функций `toStartOf*`, который описан ниже. Несмотря на то, что эти функции могут принимать `DateTime64` в качестве аргумента, если переданное значение типа `DateTime64` выходит за пределы нормального диапазона (с 1925 по 2283 год), то это даст неверный результат. + `Date` или `DateTime` это возвращаемый тип функций `toStartOf*`, который описан ниже. 
Несмотря на то, что эти функции могут принимать `DateTime64` в качестве аргумента, если переданное значение типа `DateTime64` выходит за пределы нормального диапазона (с 1900 по 2299 год), то это даст неверный результат. ::: ## toStartOfYear {#tostartofyear} diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 946abddf3d0..71caeddea02 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -209,7 +209,7 @@ SELECT toDate32('1955-01-01') AS value, toTypeName(value); ``` ``` text -┌──────value─┬─toTypeName(toDate32('1925-01-01'))─┐ +┌──────value─┬─toTypeName(toDate32('1955-01-01'))─┐ │ 1955-01-01 │ Date32 │ └────────────┴────────────────────────────────────┘ ``` @@ -217,23 +217,23 @@ SELECT toDate32('1955-01-01') AS value, toTypeName(value); 2. Значение выходит за границы диапазона: ``` sql -SELECT toDate32('1924-01-01') AS value, toTypeName(value); +SELECT toDate32('1899-01-01') AS value, toTypeName(value); ``` ``` text -┌──────value─┬─toTypeName(toDate32('1925-01-01'))─┐ -│ 1925-01-01 │ Date32 │ +┌──────value─┬─toTypeName(toDate32('1899-01-01'))─┐ +│ 1900-01-01 │ Date32 │ └────────────┴────────────────────────────────────┘ ``` 3. С аргументом типа `Date`: ``` sql -SELECT toDate32(toDate('1924-01-01')) AS value, toTypeName(value); +SELECT toDate32(toDate('1899-01-01')) AS value, toTypeName(value); ``` ``` text -┌──────value─┬─toTypeName(toDate32(toDate('1924-01-01')))─┐ +┌──────value─┬─toTypeName(toDate32(toDate('1899-01-01')))─┐ │ 1970-01-01 │ Date32 │ └────────────┴────────────────────────────────────────────┘ ``` @@ -247,14 +247,14 @@ SELECT toDate32(toDate('1924-01-01')) AS value, toTypeName(value); Запрос: ``` sql -SELECT toDate32OrZero('1924-01-01'), toDate32OrZero(''); +SELECT toDate32OrZero('1899-01-01'), toDate32OrZero(''); ``` Результат: ``` text -┌─toDate32OrZero('1924-01-01')─┬─toDate32OrZero('')─┐ -│ 1925-01-01 │ 1925-01-01 │ +┌─toDate32OrZero('1899-01-01')─┬─toDate32OrZero('')─┐ +│ 1900-01-01 │ 1900-01-01 │ └──────────────────────────────┴────────────────────┘ ``` diff --git a/docs/zh/sql-reference/data-types/datetime64.md b/docs/zh/sql-reference/data-types/datetime64.md index 571bcffd66e..da637929180 100644 --- a/docs/zh/sql-reference/data-types/datetime64.md +++ b/docs/zh/sql-reference/data-types/datetime64.md @@ -19,7 +19,7 @@ DateTime64(precision, [timezone]) 在内部,此类型以Int64类型将数据存储为自Linux纪元开始(1970-01-01 00:00:00UTC)的时间刻度数(ticks)。时间刻度的分辨率由precision参数确定。此外,`DateTime64` 类型可以像存储其他数据列一样存储时区信息,时区会影响 `DateTime64` 类型的值如何以文本格式显示,以及如何解析以字符串形式指定的时间数据 (‘2020-01-01 05:00:01.000’)。时区不存储在表的行中(也不在resultset中),而是存储在列的元数据中。详细信息请参考 [DateTime](datetime.md) 数据类型. 
-值的范围: \[1925-01-01 00:00:00, 2283-11-11 23:59:59.99999999\] (注意: 最大值的精度是8)。 +值的范围: \[1900-01-01 00:00:00, 2299-12-31 23:59:59.99999999\] (注意: 最大值的精度是8)。 ## 示例 {#examples} diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index f268e9584ce..b9fdc4e21f2 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -263,8 +263,8 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp └────────────────┘ ``` -:::注意 -下面描述的返回类型 `toStartOf` 函数是 `Date` 或 `DateTime`。尽管这些函数可以将 `DateTime64` 作为参数,但将超出正常范围(1925年-2283年)的 `DateTime64` 传递给它们会给出不正确的结果。 +:::注意 +下面描述的返回类型 `toStartOf` 函数是 `Date` 或 `DateTime`。尽管这些函数可以将 `DateTime64` 作为参数,但将超出正常范围(1900年-2299年)的 `DateTime64` 传递给它们会给出不正确的结果。 ::: ## toStartOfYear {#tostartofyear} @@ -1221,4 +1221,4 @@ SELECT fromModifiedJulianDayOrNull(58849); └────────────────────────────────────┘ ``` -[Original article](https://clickhouse.com/docs/en/query_language/functions/date_time_functions/) \ No newline at end of file +[Original article](https://clickhouse.com/docs/en/query_language/functions/date_time_functions/) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index d1b226e18b1..f38c585dcdd 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -185,11 +185,6 @@ public: static_assert(sizeof(Values) == 16); private: - - /// Mask is all-ones to allow efficient protection against overflow. - // static constexpr UInt32 date_lut_mask = 0x1ffff; - // static_assert(date_lut_mask == DATE_LUT_SIZE - 1); - /// Offset to epoch in days (ExtendedDayNum) of the first day in LUT. /// "epoch" is the Unix Epoch (starts at unix timestamp zero) static constexpr UInt32 daynum_offset_epoch = 25567; @@ -343,6 +338,15 @@ public: return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; } + static UInt16 normalizeDayNum(Int64 d) + { + if (d < 0) + return 0; + if (d > 65535) + return 65535; + return static_cast(d); + } + /// Round down to start of monday. 
template inline Time toFirstDayOfWeek(DateOrTime v) const diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 10408093240..341a7cf504d 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -288,12 +288,7 @@ struct AddDaysImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - Int64 r = d + delta; - if (r < 0) - return 0; - if (r > 65535) - return 65535; - return static_cast(r); + return DateLUT::instance().normalizeDayNum(d + delta); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) @@ -327,12 +322,7 @@ struct AddWeeksImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - Int64 r = d + delta * 7; - if (r < 0) - return 0; - if (r > 65535) - return 65535; - return static_cast(r); + return DateLUT::instance().normalizeDayNum(d + delta * 7); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 0b0a4e9f21b..af75e4f49ba 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -536,7 +536,6 @@ template struct ConvertImpl struct ConvertImpl : DateTimeTransformImpl> {}; - const time_t LUT_MIN_TIME = -2208988800l; // 1900-01-01 UTC const time_t LUT_MAX_TIME = 10382256000l; // 2299-12-31 UTC From 6028f7909b2775733935643d6ede16cf72436118 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 20 Jul 2022 20:45:29 +0000 Subject: [PATCH 028/164] Fixing build. --- src/Interpreters/AsynchronousMetrics.cpp | 4 ++-- .../test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index b5dc9533f56..32c4e421ac3 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -664,12 +664,12 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti new_values["jemalloc.epoch"] = epoch; // Collect the statistics themselves. 
- size_t je_malloc_allocated = saveJemallocMetric(new_values, "allocated"); + [[maybe_unused]] size_t je_malloc_allocated = saveJemallocMetric(new_values, "allocated"); saveJemallocMetric(new_values, "active"); saveJemallocMetric(new_values, "metadata"); saveJemallocMetric(new_values, "metadata_thp"); saveJemallocMetric(new_values, "resident"); - size_t je_malloc_mapped = saveJemallocMetric(new_values, "mapped"); + [[maybe_unused]] size_t je_malloc_mapped = saveJemallocMetric(new_values, "mapped"); saveJemallocMetric(new_values, "retained"); saveJemallocMetric(new_values, "background_thread.num_threads"); saveJemallocMetric(new_values, "background_thread.num_runs"); diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py index eba3aeff303..35c29959a43 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py @@ -43,7 +43,7 @@ def test_memory_tracking_total(): "-c", "clickhouse local -q \"SELECT arrayStringConcat(arrayMap(x->toString(cityHash64(x)), range(1000)), ' ') from numbers(10000)\" > data.json", ], - user="root" + user="root", ) for it in range(0, 20): From 6b568ffa32542e89073d58fdbc1d8c4f950fe10d Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Jul 2022 09:03:53 +0000 Subject: [PATCH 029/164] Fix test --- .../02314_csv_tsv_skip_first_lines.reference | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference index 7d8e0c662cd..4274f5769ee 100644 --- a/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference +++ b/tests/queries/0_stateless/02314_csv_tsv_skip_first_lines.reference @@ -1,14 +1,14 @@ -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) 0 1 2 1 2 3 2 3 4 3 4 5 4 5 6 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) 0 1 2 1 2 3 2 3 4 From 02c90fd94b30cc48c45393351f977fb9764ed0e0 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Jul 2022 16:54:42 +0000 Subject: [PATCH 030/164] Fix tests --- tests/integration/test_storage_hdfs/test.py | 2 +- tests/integration/test_storage_s3/test.py | 4 +- .../02188_table_function_format.reference | 8 +- ...2211_shcema_inference_from_stdin.reference | 8 +- ...e_table_without_columns_metadata.reference | 2 +- ...column_names_in_shcmea_inference.reference | 8 +- .../0_stateless/02244_hdfs_cluster.reference | 24 ++-- ...247_names_order_in_json_and_tskv.reference | 24 ++-- ...02247_read_bools_as_numbers_json.reference | 4 +- .../02268_json_maps_and_objects.reference | 2 +- .../02286_mysql_dump_input_format.reference | 70 +++++------ .../02293_formats_json_columns.reference | 18 +-- ...modifiers_with_non-default_types.reference | 113 ++++++++++++++++++ 13 files changed, 200 insertions(+), 87 deletions(-) create mode 100644 tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 0490c0c1f0d..a9e6cbda67c 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -537,7 +537,7 @@ def 
test_schema_inference_with_globs(started_cluster): ) result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/data*.jsoncompacteachrow')") - assert result.strip() == "c1\tNullable(Float64)" + assert result.strip() == "c1\tNullable(Int64)" result = node1.query( f"select * from hdfs('hdfs://hdfs1:9000/data*.jsoncompacteachrow')" diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 5dd09ddd362..5b0600c0023 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1306,7 +1306,7 @@ def test_schema_inference_from_globs(started_cluster): result = instance.query( f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}')" ) - assert result.strip() == "c1\tNullable(Float64)" + assert result.strip() == "c1\tNullable(Int64)" result = instance.query( f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}')" @@ -1316,7 +1316,7 @@ def test_schema_inference_from_globs(started_cluster): result = instance.query( f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow')" ) - assert result.strip() == "c1\tNullable(Float64)" + assert result.strip() == "c1\tNullable(Int64)" result = instance.query( f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow')" diff --git a/tests/queries/0_stateless/02188_table_function_format.reference b/tests/queries/0_stateless/02188_table_function_format.reference index 403a4044544..5d7febda187 100644 --- a/tests/queries/0_stateless/02188_table_function_format.reference +++ b/tests/queries/0_stateless/02188_table_function_format.reference @@ -39,9 +39,9 @@ World 123 Hello 111 World 123 1 2 [1,2,3] [['abc'],[],['d','e']] -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Array(Nullable(Float64)) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Array(Nullable(Int64)) c4 Array(Array(Nullable(String))) Hello 111 World 123 @@ -49,4 +49,4 @@ Hello 111 Hello 131 World 123 a Nullable(String) -b Nullable(Float64) +b Nullable(Int64) diff --git a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference index 6920aa16198..4a4389f638e 100644 --- a/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference +++ b/tests/queries/0_stateless/02211_shcema_inference_from_stdin.reference @@ -1,4 +1,4 @@ -x Nullable(Float64) +x Nullable(Int64) 0 1 2 @@ -9,7 +9,7 @@ x Nullable(Float64) 7 8 9 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) 1 2 3 diff --git a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference index f32b0eb8a92..869c6b58dd4 100644 --- a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference +++ b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference @@ -1,3 +1,3 @@ -CREATE TABLE default.test\n(\n `x` Nullable(Float64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') +CREATE TABLE test.test\n(\n `x` Nullable(Int64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') OK OK diff --git a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference 
b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference index d237caf630f..7a124346b9f 100644 --- a/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference +++ b/tests/queries/0_stateless/02244_column_names_in_shcmea_inference.reference @@ -1,8 +1,8 @@ x Nullable(String) -y Nullable(Float64) +y Nullable(Int64) x Nullable(String) -y Nullable(Float64) +y Nullable(Int64) x Nullable(String) -y Nullable(Float64) +y Nullable(Int64) x Nullable(String) -y Nullable(Float64) +y Nullable(Int64) diff --git a/tests/queries/0_stateless/02244_hdfs_cluster.reference b/tests/queries/0_stateless/02244_hdfs_cluster.reference index 4bf4799e904..32a9f24388c 100644 --- a/tests/queries/0_stateless/02244_hdfs_cluster.reference +++ b/tests/queries/0_stateless/02244_hdfs_cluster.reference @@ -22,24 +22,24 @@ 1 2 3 4 5 6 7 8 9 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) c1 UInt32 c2 UInt32 c3 UInt32 c1 UInt32 c2 UInt32 c3 UInt32 -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) -c1 Nullable(Float64) -c2 Nullable(Float64) -c3 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Int64) c1 UInt32 c2 UInt32 c3 UInt32 diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index 300846c17a0..c7774c8138b 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -1,32 +1,32 @@ -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Nullable(Float64)) +c Array(Nullable(Int64)) 1 s1 [] 2 } [2] \N \N [] \N \N [] \N \N [3] -b Nullable(Float64) -a Nullable(Float64) -c Nullable(Float64) -e Nullable(Float64) +b Nullable(Int64) +a Nullable(Int64) +c Nullable(Int64) +e Nullable(Int64) 1 \N \N \N \N 2 3 \N \N \N \N \N \N \N \N 3 3 3 1 \N -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Nullable(Float64)) +c Array(Nullable(Int64)) 1 s1 [] 2 \N [2] \N \N [] \N \N [] \N \N [3] -b Nullable(Float64) -a Nullable(Float64) -c Nullable(Float64) -e Nullable(Float64) +b Nullable(Int64) +a Nullable(Int64) +c Nullable(Int64) +e Nullable(Int64) 1 \N \N \N \N 2 3 \N \N \N \N \N diff --git a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference index b6d10581b16..840e77cb122 100644 --- a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference +++ b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference @@ -10,7 +10,7 @@ x Nullable(Float64) x Array(Nullable(Float64)) [1,0] [0.42] -x Array(Array(Nullable(Float64))) +x Array(Array(Nullable(Int64))) [[1,2,3],[1,0],[1,1,0]] [[1,2,3]] c1 Nullable(Bool) @@ -25,6 +25,6 @@ c1 Nullable(Float64) c1 Array(Nullable(Float64)) [1,0] [0.42] -c1 Array(Array(Nullable(Float64))) +c1 Array(Array(Nullable(Int64))) [[1,2,3],[1,0],[1,1,0]] [[1,2,3]] diff --git a/tests/queries/0_stateless/02268_json_maps_and_objects.reference b/tests/queries/0_stateless/02268_json_maps_and_objects.reference index 73a8a8f43cf..87fb8949e01 100644 --- a/tests/queries/0_stateless/02268_json_maps_and_objects.reference +++ 
b/tests/queries/0_stateless/02268_json_maps_and_objects.reference @@ -2,4 +2,4 @@ x Object(Nullable(\'json\')) x Object(Nullable(\'json\')) x Array(Object(Nullable(\'json\'))) x Array(Object(Nullable(\'json\'))) -x Tuple(Map(String, Nullable(String)), Map(String, Array(Nullable(Float64))), Array(Nullable(Float64))) +x Tuple(Map(String, Nullable(String)), Map(String, Array(Nullable(Int64))), Array(Nullable(Int64))) diff --git a/tests/queries/0_stateless/02286_mysql_dump_input_format.reference b/tests/queries/0_stateless/02286_mysql_dump_input_format.reference index 25be4b727bc..a736358b9b7 100644 --- a/tests/queries/0_stateless/02286_mysql_dump_input_format.reference +++ b/tests/queries/0_stateless/02286_mysql_dump_input_format.reference @@ -130,8 +130,8 @@ x Nullable(Int32) x Nullable(Int32) 1 dump7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -139,8 +139,8 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -148,15 +148,15 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) +c1 Nullable(Int64) 1 2 3 -c1 Nullable(Float64) +c1 Nullable(Int64) 1 dump8 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -164,8 +164,8 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -174,8 +174,8 @@ c2 Nullable(Float64) 5 \N 6 7 dump9 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -183,8 +183,8 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -193,8 +193,8 @@ c2 Nullable(Float64) 5 \N 6 7 dump10 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -202,8 +202,8 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -212,8 +212,8 @@ c2 Nullable(Float64) 5 \N 6 7 dump11 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -221,8 +221,8 @@ c2 Nullable(Float64) 4 \N 5 \N 6 7 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) 1 \N 2 \N 3 \N @@ -265,8 +265,8 @@ color Nullable(String) price Nullable(Int32) apple red 42 dump14 -x Nullable(Float64) -y Nullable(Float64) +x Nullable(Int64) +y Nullable(Int64) 1 \N 2 \N 3 \N @@ -274,8 +274,8 @@ y Nullable(Float64) 4 \N 5 \N 6 7 -x Nullable(Float64) -y Nullable(Float64) +x Nullable(Int64) +y Nullable(Int64) 1 \N 2 \N 3 \N @@ -283,15 +283,15 @@ y Nullable(Float64) 4 \N 5 \N 6 7 -x Nullable(Float64) +x Nullable(Int64) 1 2 3 -x Nullable(Float64) +x Nullable(Int64) 1 dump15 -x Nullable(Float64) -y Nullable(Float64) +x Nullable(Int64) +y Nullable(Int64) 1 \N 2 \N 3 \N @@ -299,8 +299,8 @@ y Nullable(Float64) 4 \N 5 \N 6 7 -x Nullable(Float64) -y Nullable(Float64) +x Nullable(Int64) +y Nullable(Int64) 1 \N 2 \N 3 \N @@ -308,10 +308,10 @@ y Nullable(Float64) 4 \N 5 \N 6 7 -x Nullable(Float64) +x Nullable(Int64) 1 2 3 -x Nullable(Float64) -y Nullable(Float64) +x Nullable(Int64) +y Nullable(Int64) 1 2 diff --git a/tests/queries/0_stateless/02293_formats_json_columns.reference b/tests/queries/0_stateless/02293_formats_json_columns.reference index da8d080ac05..f59f02ad42b 100644 --- 
a/tests/queries/0_stateless/02293_formats_json_columns.reference +++ b/tests/queries/0_stateless/02293_formats_json_columns.reference @@ -4,9 +4,9 @@ JSONColumns "b": ["String", "String", "String", "String", "String"], "c": [[[[],"String"],[[],"gnirtS"]], [[[0],"String"],[[0],"gnirtS"]], [[[0,1],"String"],[[0,1],"gnirtS"]], [[[],"String"],[[0,1,2],"gnirtS"]], [[[0],"String"],[[],"gnirtS"]]] } -a Nullable(Float64) +a Nullable(Int64) b Nullable(String) -c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 0 String [([],'String'),([],'gnirtS')] 1 String [([0],'String'),([0],'gnirtS')] 2 String [([0,1],'String'),([0,1],'gnirtS')] @@ -18,9 +18,9 @@ JSONCompactColumns ["String", "String", "String", "String", "String"], [[[[],"String"],[[],"gnirtS"]], [[[0],"String"],[[0],"gnirtS"]], [[[0,1],"String"],[[0,1],"gnirtS"]], [[[],"String"],[[0,1,2],"gnirtS"]], [[[0],"String"],[[],"gnirtS"]]] ] -c1 Nullable(Float64) +c1 Nullable(Int64) c2 Nullable(String) -c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c3 Array(Tuple(Array(Nullable(Int64)), Nullable(String))) 0 String [([],'String'),([],'gnirtS')] 1 String [([0],'String'),([0],'gnirtS')] 2 String [([0,1],'String'),([0,1],'gnirtS')] @@ -74,9 +74,9 @@ JSONColumnsWithMetadata "bytes_read": 20 } } -b Nullable(Float64) -a Nullable(Float64) -c Nullable(Float64) +b Nullable(Int64) +a Nullable(Int64) +c Nullable(Int64) d Nullable(String) 1 3 \N \N 2 2 \N \N @@ -89,8 +89,8 @@ OK 3 2 1 -c1 Nullable(Float64) -c2 Nullable(Float64) +c1 Nullable(Int64) +c2 Nullable(Int64) c3 Nullable(String) 1 1 \N 2 2 \N diff --git a/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference b/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference new file mode 100644 index 00000000000..183c63d1222 --- /dev/null +++ b/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference @@ -0,0 +1,113 @@ +-- { echoOn } +SELECT + count() as d, a, b, c +FROM test02313 +GROUP BY ROLLUP(a, b, c) +ORDER BY d, a, b, c; +1 one default 0 +1 one default 2 +1 one default 4 +1 one default 6 +1 one default 8 +1 two non-default 1 +1 two non-default 3 +1 two non-default 5 +1 two non-default 7 +1 two non-default 9 +5 one default 0 +5 one default 0 +5 two default 0 +5 two non-default 0 +10 one default 0 +SELECT + count() as d, a, b, c +FROM test02313 +GROUP BY CUBE(a, b, c) +ORDER BY d, a, b, c; +1 one default 0 +1 one default 0 +1 one default 0 +1 one default 0 +1 one default 1 +1 one default 2 +1 one default 2 +1 one default 2 +1 one default 2 +1 one default 3 +1 one default 4 +1 one default 4 +1 one default 4 +1 one default 4 +1 one default 5 +1 one default 6 +1 one default 6 +1 one default 6 +1 one default 6 +1 one default 7 +1 one default 8 +1 one default 8 +1 one default 8 +1 one default 8 +1 one default 9 +1 one non-default 1 +1 one non-default 3 +1 one non-default 5 +1 one non-default 7 +1 one non-default 9 +1 two default 1 +1 two default 3 +1 two default 5 +1 two default 7 +1 two default 9 +1 two non-default 1 +1 two non-default 3 +1 two non-default 5 +1 two non-default 7 +1 two non-default 9 +5 one default 0 +5 one default 0 +5 one default 0 +5 one non-default 0 +5 two default 0 +5 two non-default 0 +10 one default 0 +SELECT + count() as d, a, b, c +FROM test02313 +GROUP BY GROUPING SETS + ( + (c), + (a, c), + (b, c) + ) +ORDER BY d, a, b, c; +1 one default 0 +1 one default 0 +1 one default 0 +1 one default 1 +1 one default 2 +1 one default 2 
+1 one default 2 +1 one default 3 +1 one default 4 +1 one default 4 +1 one default 4 +1 one default 5 +1 one default 6 +1 one default 6 +1 one default 6 +1 one default 7 +1 one default 8 +1 one default 8 +1 one default 8 +1 one default 9 +1 one non-default 1 +1 one non-default 3 +1 one non-default 5 +1 one non-default 7 +1 one non-default 9 +1 two default 1 +1 two default 3 +1 two default 5 +1 two default 7 +1 two default 9 From 74d2bf8d95a901dbefda5425bb9cdef1ebaaf96c Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 22 Jul 2022 07:49:02 +0000 Subject: [PATCH 031/164] Fix gTest for DateLUTTest --- src/Common/tests/gtest_DateLUTImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index 95cad92efca..49013625ed3 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -84,8 +84,8 @@ TEST(DateLUTTest, makeDayNumTest) EXPECT_EQ(-25567, lut.makeDayNum(1900, 1, 1)); EXPECT_EQ(-16436, lut.makeDayNum(1925, 1, 1)); EXPECT_EQ(0, lut.makeDayNum(1970, 1, 1)); - EXPECT_EQ(120894, lut.makeDayNum(2300, 12, 31)); - EXPECT_EQ(120894, lut.makeDayNum(2500, 12, 25)); + EXPECT_EQ(120529, lut.makeDayNum(2300, 12, 31)); + EXPECT_EQ(120529, lut.makeDayNum(2500, 12, 25)); } From 0b102c6d1ffeeeae79c3f50a522288367a567a46 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 22 Jul 2022 08:24:05 +0000 Subject: [PATCH 032/164] Fix code style --- src/Common/DateLUTImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index f38c585dcdd..56f2e87ccb5 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -74,7 +74,7 @@ private: static inline LUTIndex normalizeLUTIndex(Int64 index) { - if unlikely(index < 0 ) + if (unlikely(index < 0)) return LUTIndex(0); if (index >= DATE_LUT_SIZE) return LUTIndex(DATE_LUT_SIZE - 1); From 3c2449eb3dc25140e87e3c9e53c73a33faea0438 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 25 Jul 2022 17:31:21 +0200 Subject: [PATCH 033/164] Fix tests --- .../02222_create_table_without_columns_metadata.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference index 869c6b58dd4..effc3644b41 100644 --- a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference +++ b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference @@ -1,3 +1,3 @@ -CREATE TABLE test.test\n(\n `x` Nullable(Int64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') +CREATE TABLE default.test\n(\n `x` Nullable(Int64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') OK OK From b412ea5f6d85c68d1cb246df2d17b93e8c95720a Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 25 Jul 2022 17:06:11 +0000 Subject: [PATCH 034/164] Improve generateRandom() for Date32; fix tests 01087_table_function_generate, 01277_fromUnixTimestamp64, 01691_DateTime64_clamp and 01702_toDateTime_from_string_clamping --- src/Common/DateLUTImpl.h | 4 +++- src/Functions/FunctionsConversion.h | 2 +- src/Storages/StorageGenerateRandom.cpp | 5 ++++- .../01087_table_function_generate.reference | 20 +++++++++---------- .../01277_fromUnixTimestamp64.reference | 4 ++-- .../0_stateless/01277_fromUnixTimestamp64.sql | 
18 ++++++++--------- .../01691_DateTime64_clamp.reference | 8 ++++---- ..._toDateTime_from_string_clamping.reference | 6 +++--- .../01702_toDateTime_from_string_clamping.sql | 4 ++-- 9 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 56f2e87ccb5..fbbd29d7a2b 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -19,8 +19,10 @@ #define DATE_LUT_MAX (0xFFFFFFFFU - 86400) #define DATE_LUT_MAX_DAY_NUM 0xFFFF +#define DAYNUM_OFFSET_EPOCH 25567 + /// Max int value of Date32, DATE LUT cache size minus daynum_offset_epoch -#define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - 25567) +#define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - DAYNUM_OFFSET_EPOCH) /// A constant to add to time_t so every supported time point becomes non-negative and still has the same remainder of division by 3600. /// If we treat "remainder of division" operation in the sense of modular arithmetic (not like in C++). diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index af75e4f49ba..62f95031e58 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -538,7 +538,7 @@ template struct ConvertImplgetData().resize(limit); - fillBufferWithRandomData(reinterpret_cast(column->getData().data()), limit * sizeof(Int32), rng); + + for (size_t i = 0; i < limit; ++i) + column->getData()[i] = (rng() % static_cast(DATE_LUT_SIZE)) - DAYNUM_OFFSET_EPOCH; + return column; } case TypeIndex::UInt32: [[fallthrough]]; diff --git a/tests/queries/0_stateless/01087_table_function_generate.reference b/tests/queries/0_stateless/01087_table_function_generate.reference index d62ff5618fc..53792bfb579 100644 --- a/tests/queries/0_stateless/01087_table_function_generate.reference +++ b/tests/queries/0_stateless/01087_table_function_generate.reference @@ -70,16 +70,16 @@ DateTime64(3, \'UTC\') DateTime64(6, \'UTC\') DateTime64(6, \'UTC\') 1992-12-28 09:26:04.030 1971-07-29 06:20:38.230976 1980-03-26 15:49:55.428516 2051-12-11 07:09:13.162 1982-01-12 00:25:45.754492 2010-05-17 07:01:28.452864 Date32 -1934-01-06 -2039-08-16 -2103-11-03 -2064-08-14 -2187-08-21 -2099-04-08 -1947-06-22 -2012-01-19 -2170-07-09 -2263-01-17 +2120-04-24 +1908-10-02 +2105-09-04 +2129-03-23 +1921-04-05 +2020-04-14 +2251-12-25 +2266-03-27 +2161-02-18 +2172-07-24 - Float32 Float64 -1.3551149e32 1.2262973812461839e235 diff --git a/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference b/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference index a9ffd259af0..28006c1d168 100644 --- a/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference +++ b/tests/queries/0_stateless/01277_fromUnixTimestamp64.reference @@ -4,6 +4,6 @@ Asia/Makassar 1234567891011 2009-02-14 07:31:31.011 1970-01-15 14:56:07.891011 1 non-const column 1234567891011 2009-02-13 23:31:31.011 1970-01-15 06:56:07.891011 1970-01-01 00:20:34.567891011 upper range bound -9904447342 2283-11-10 19:22:22.123 2283-11-10 19:22:22.123456 1925-01-01 00:00:00.413905173 +10413688942 2299-12-30 19:22:22.123 2299-12-30 19:22:22.123456 1900-01-01 00:00:00.413905173 lower range bound --1420066799 1925-01-01 01:00:00.877 1925-01-01 01:00:00.876544 1925-01-01 01:00:00.876543211 +-2208985199 1900-01-01 01:00:00.877 1900-01-01 01:00:00.876544 1900-01-01 01:00:00.876543211 diff --git a/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql b/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql index e76a4db7a27..846ffa094a5 100644 --- 
a/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql +++ b/tests/queries/0_stateless/01277_fromUnixTimestamp64.sql @@ -46,10 +46,10 @@ SELECT SELECT 'upper range bound'; WITH - 9904447342 AS timestamp, - CAST(9904447342123 AS Int64) AS milli, - CAST(9904447342123456 AS Int64) AS micro, - CAST(9904447342123456789 AS Int64) AS nano, + 10413688942 AS timestamp, + CAST(10413688942123 AS Int64) AS milli, + CAST(10413688942123456 AS Int64) AS micro, + CAST(10413688942123456789 AS Int64) AS nano, 'UTC' AS tz SELECT timestamp, @@ -59,13 +59,13 @@ SELECT SELECT 'lower range bound'; WITH - -1420066799 AS timestamp, - CAST(-1420066799123 AS Int64) AS milli, - CAST(-1420066799123456 AS Int64) AS micro, - CAST(-1420066799123456789 AS Int64) AS nano, + -2208985199 AS timestamp, + CAST(-2208985199123 AS Int64) AS milli, + CAST(-2208985199123456 AS Int64) AS micro, + CAST(-2208985199123456789 AS Int64) AS nano, 'UTC' AS tz SELECT timestamp, fromUnixTimestamp64Milli(milli, tz), fromUnixTimestamp64Micro(micro, tz), - fromUnixTimestamp64Nano(nano, tz); \ No newline at end of file + fromUnixTimestamp64Nano(nano, tz); diff --git a/tests/queries/0_stateless/01691_DateTime64_clamp.reference b/tests/queries/0_stateless/01691_DateTime64_clamp.reference index 6272103440c..75435aebd67 100644 --- a/tests/queries/0_stateless/01691_DateTime64_clamp.reference +++ b/tests/queries/0_stateless/01691_DateTime64_clamp.reference @@ -18,10 +18,10 @@ SELECT toDateTime64(toFloat32(bitShiftLeft(toUInt64(1),33)), 2, 'Asia/Istanbul') SELECT toDateTime64(toFloat64(bitShiftLeft(toUInt64(1),33)), 2, 'Asia/Istanbul') FORMAT Null; -- These are outsize of extended range and hence clamped SELECT toDateTime64(-1 * bitShiftLeft(toUInt64(1), 35), 2, 'Asia/Istanbul'); -1925-01-01 02:00:00.00 +1900-01-01 01:56:56.00 SELECT CAST(-1 * bitShiftLeft(toUInt64(1), 35) AS DateTime64(3, 'Asia/Istanbul')); -1925-01-01 02:00:00.000 +1900-01-01 01:56:56.000 SELECT CAST(bitShiftLeft(toUInt64(1), 35) AS DateTime64(3, 'Asia/Istanbul')); -2282-12-31 03:00:00.000 +2299-12-31 23:59:59.000 SELECT toDateTime64(bitShiftLeft(toUInt64(1), 35), 2, 'Asia/Istanbul'); -2282-12-31 03:00:00.00 +2299-12-31 23:59:59.00 diff --git a/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.reference b/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.reference index f27bf42c7a5..ecea0a9f69f 100644 --- a/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.reference +++ b/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.reference @@ -1,4 +1,4 @@ 1940-10-09 21:13:17.6 -2283-11-11 23:46:43.6 -2283-11-11 23:46:40.1 -1925-01-01 00:00:00.9 +2284-06-04 23:46:43.6 +2299-12-31 23:40:00.1 +1900-01-01 00:00:00.9 diff --git a/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.sql b/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.sql index b0dbd1dfc84..e84bb35b3a5 100644 --- a/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.sql +++ b/tests/queries/0_stateless/01702_toDateTime_from_string_clamping.sql @@ -1,4 +1,4 @@ SELECT toString(toDateTime('-922337203.6854775808', 1, 'Asia/Istanbul')); SELECT toString(toDateTime('9922337203.6854775808', 1, 'Asia/Istanbul')); -SELECT toDateTime64(CAST('10000000000.1' AS Decimal64(1)), 1, 'Asia/Istanbul'); -SELECT toDateTime64(CAST('-10000000000.1' AS Decimal64(1)), 1, 'Asia/Istanbul'); +SELECT toDateTime64(CAST('10500000000.1' AS Decimal64(1)), 1, 'Asia/Istanbul'); +SELECT toDateTime64(CAST('-10500000000.1' AS Decimal64(1)), 1, 'Asia/Istanbul'); From 
fe79b08a7cb20c1af62bc7dbb11ad9e95ae7f4f2 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 25 Jul 2022 18:38:48 +0000 Subject: [PATCH 035/164] Fix test 02346_non_negative_derivative --- .../02346_non_negative_derivative.reference | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/queries/0_stateless/02346_non_negative_derivative.reference b/tests/queries/0_stateless/02346_non_negative_derivative.reference index 51fe2359bd0..b81af45962e 100644 --- a/tests/queries/0_stateless/02346_non_negative_derivative.reference +++ b/tests/queries/0_stateless/02346_non_negative_derivative.reference @@ -1,63 +1,63 @@ 1 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 0.20550000000000002 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 6.165000000000001e-10 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 8.22e-7 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 0.0010275000000000002 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 1.233 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 86.31 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 5918.400000000001 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 159796.80000000002 1979-12-12 21:21:23.000 1.54 0 1979-12-12 21:21:21.127 3.7 0 -2283-11-11 23:37:36.788 1.1 0 -2283-11-11 23:37:36.789 2.34 0 +2299-12-31 23:37:36.788 1.1 0 +2299-12-31 23:37:36.789 2.34 0 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 1242864 From f135a82141b559a8ceb67584743886005211f3d7 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Jul 2022 19:26:22 -0700 Subject: [PATCH 036/164] Addressed changes requested by @evillique (Restructured OwnJSONPatternFormatter, used different version of writeJSONString() etc... 
--- src/Daemon/BaseDaemon.cpp | 52 ++++++++-------- src/Loggers/Loggers.cpp | 62 ++++++------------ src/Loggers/OwnFormattingChannel.cpp | 13 ++-- src/Loggers/OwnJSONPatternFormatter.cpp | 83 ++++++++++++++----------- src/Loggers/OwnJSONPatternFormatter.h | 5 +- src/Loggers/OwnPatternFormatter.cpp | 10 +-- src/Loggers/OwnPatternFormatter.h | 1 + 7 files changed, 110 insertions(+), 116 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 95967bb2b82..81317aa3b6a 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -5,17 +5,17 @@ #include #include +#include #include #include #include #include -#include #if defined(OS_LINUX) - #include +# include #endif #include -#include #include +#include #include #include @@ -71,11 +71,11 @@ namespace fs = std::filesystem; namespace DB { - namespace ErrorCodes - { - extern const int CANNOT_SET_SIGNAL_HANDLER; - extern const int CANNOT_SEND_SIGNAL; - } +namespace ErrorCodes +{ + extern const int CANNOT_SET_SIGNAL_HANDLER; + extern const int CANNOT_SEND_SIGNAL; +} } DB::PipeFDs signal_pipe; @@ -367,8 +367,11 @@ private: String calculated_binary_hash = getHashOfLoadedBinaryHex(); if (daemon.stored_binary_hash.empty()) { - LOG_FATAL(log, "Integrity check of the executable skipped because the reference checksum could not be read." - " (calculated checksum: {})", calculated_binary_hash); + LOG_FATAL( + log, + "Integrity check of the executable skipped because the reference checksum could not be read." + " (calculated checksum: {})", + calculated_binary_hash); } else if (calculated_binary_hash == daemon.stored_binary_hash) { @@ -376,15 +379,18 @@ private: } else { - LOG_FATAL(log, "Calculated checksum of the executable ({0}) does not correspond" + LOG_FATAL( + log, + "Calculated checksum of the executable ({0}) does not correspond" " to the reference checksum stored in the executable ({1})." " This may indicate one of the following:" " - the executable was changed just after startup;" " - the executable was corrupted on disk due to faulty hardware;" " - the loaded executable was corrupted in memory due to faulty hardware;" " - the file was intentionally modified;" - " - a logical error in the code." - , calculated_binary_hash, daemon.stored_binary_hash); + " - a logical error in the code.", + calculated_binary_hash, + daemon.stored_binary_hash); } #endif @@ -1009,18 +1015,15 @@ void BaseDaemon::setupWatchdog() /// If streaming compression of logs is used then we write watchdog logs to cerr if (config().getRawString("logger.stream_compress", "false") == "true") { + Poco::AutoPtr pf; + if (config().has("logger.json")) - { - Poco::AutoPtr pf = new OwnJSONPatternFormatter; - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); - logger().setChannel(log); - } + pf = new OwnJSONPatternFormatter; else - { - Poco::AutoPtr pf = new OwnPatternFormatter; - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); - logger().setChannel(log); - } + pf = new OwnPatternFormatter(true); + + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); + logger().setChannel(log); } logger().information(fmt::format("Will watch for the process with pid {}", pid)); @@ -1028,8 +1031,7 @@ void BaseDaemon::setupWatchdog() /// Forward signals to the child process. 
addSignalHandler( {SIGHUP, SIGINT, SIGQUIT, SIGTERM}, - [](int sig, siginfo_t *, void *) - { + [](int sig, siginfo_t *, void *) { /// Forward all signals except INT as it can be send by terminal to the process group when user press Ctrl+C, /// and we process double delivery of this signal as immediate termination. if (sig == SIGINT) diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index fdcd75f761c..c02363c6017 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -96,22 +96,16 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log log_file->setProperty(Poco::FileChannel::PROP_ROTATEONOPEN, config.getRawString("logger.rotateOnOpen", "false")); log_file->open(); + Poco::AutoPtr pf; + if (config.has("logger.json")) - { - Poco::AutoPtr pf = new OwnJSONPatternFormatter; - - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); - log->setLevel(log_level); - split->addChannel(log, "log"); - } + pf = new OwnJSONPatternFormatter; else - { - Poco::AutoPtr pf = new OwnPatternFormatter; + pf = new OwnPatternFormatter(true); - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); - log->setLevel(log_level); - split->addChannel(log, "log"); - } + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); + log->setLevel(log_level); + split->addChannel(log, "log"); } const auto errorlog_path = config.getString("logger.errorlog", ""); @@ -143,24 +137,16 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log error_log_file->setProperty(Poco::FileChannel::PROP_FLUSH, config.getRawString("logger.flush", "true")); error_log_file->setProperty(Poco::FileChannel::PROP_ROTATEONOPEN, config.getRawString("logger.rotateOnOpen", "false")); + Poco::AutoPtr pf; if (config.has("logger.json")) - { - Poco::AutoPtr pf = new OwnJSONPatternFormatter; - - Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); - errorlog->setLevel(errorlog_level); - errorlog->open(); - split->addChannel(errorlog, "errorlog"); - } + pf = new OwnJSONPatternFormatter; else - { - Poco::AutoPtr pf = new OwnPatternFormatter; + pf = new OwnPatternFormatter(true); - Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); - errorlog->setLevel(errorlog_level); - errorlog->open(); - split->addChannel(errorlog, "errorlog"); - } + Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); + errorlog->setLevel(errorlog_level); + errorlog->open(); + split->addChannel(errorlog, "errorlog"); } if (config.getBool("logger.use_syslog", false)) @@ -193,25 +179,17 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log syslog_channel->setProperty(Poco::SyslogChannel::PROP_FACILITY, config.getString("logger.syslog.facility", "LOG_DAEMON")); } syslog_channel->open(); + Poco::AutoPtr pf; if (config.has("logger.json")) - { - Poco::AutoPtr pf = new OwnJSONPatternFormatter; - - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); - log->setLevel(syslog_level); - - split->addChannel(log, "syslog"); - } + pf = new OwnJSONPatternFormatter; else - { - Poco::AutoPtr pf = new OwnPatternFormatter; + pf = new OwnPatternFormatter(true); - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); - log->setLevel(syslog_level); + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); + log->setLevel(syslog_level); - split->addChannel(log, "syslog"); - } + split->addChannel(log, "syslog"); } bool should_log_to_console = isatty(STDIN_FILENO) || 
isatty(STDERR_FILENO); diff --git a/src/Loggers/OwnFormattingChannel.cpp b/src/Loggers/OwnFormattingChannel.cpp index 35f035d44ce..a023c28c5ae 100644 --- a/src/Loggers/OwnFormattingChannel.cpp +++ b/src/Loggers/OwnFormattingChannel.cpp @@ -7,16 +7,13 @@ void OwnFormattingChannel::logExtended(const ExtendedLogMessage & msg) { if (pChannel && priority >= msg.base.getPriority()) { - if (pFormatterJSON) + if (pFormatterJSON || pFormatter) { std::string text; - pFormatterJSON->formatExtendedJSON(msg, text); - pChannel->log(Poco::Message(msg.base, text)); - } - else if (pFormatter) - { - std::string text; - pFormatter->formatExtended(msg, text); + if (pFormatterJSON) + pFormatterJSON->formatExtended(msg, text); + else + pFormatter->formatExtended(msg, text); pChannel->log(Poco::Message(msg.base, text)); } else diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp index 825cb1e9806..63f2c60f70e 100644 --- a/src/Loggers/OwnJSONPatternFormatter.cpp +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -8,23 +8,24 @@ #include #include -OwnJSONPatternFormatter::OwnJSONPatternFormatter() : Poco::PatternFormatter("") +OwnJSONPatternFormatter::OwnJSONPatternFormatter() : OwnPatternFormatter("") { } -void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text) +void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) { DB::WriteBufferFromString wb(text); DB::FormatSettings settings; - String key_name; + char key_name[] = "a placeholder for key names in structured logging"; + char empty_string[] = ""; const Poco::Message & msg = msg_ext.base; DB::writeChar('{', wb); - key_name = "date_time"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "date_time"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); DB::writeChar('\"', wb); @@ -41,15 +42,20 @@ void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & DB::writeChar(',', wb); - key_name = "thread_name"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "thread_name"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); - writeJSONString(StringRef(msg.getThread()), wb, settings); + + const char * thread_name = msg.getThread().c_str(); + if (thread_name != nullptr) + writeJSONString(thread_name, thread_name + strlen(thread_name), wb, settings); + else + writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); DB::writeChar(',', wb); - key_name = "thread_id"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "thread_id"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); DB::writeChar('\"', wb); DB::writeIntText(msg_ext.thread_id, wb); @@ -57,58 +63,65 @@ void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & DB::writeChar(',', wb); - key_name = "level"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "level"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); - int priority = static_cast(msg.getPriority()); - writeJSONString(StringRef(getPriorityName(priority)), wb, settings); + int priority_int = static_cast(msg.getPriority()); + String priority_str = std::to_string(priority_int); + const char * priority = priority_str.c_str(); + if (priority != nullptr) + writeJSONString(priority, 
priority + strlen(priority), wb, settings); + else + writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); DB::writeChar(',', wb); /// We write query_id even in case when it is empty (no query context) /// just to be convenient for various log parsers. - key_name = "query_id"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "query_id"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); writeJSONString(msg_ext.query_id, wb, settings); DB::writeChar(',', wb); - key_name = "logger_name"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "logger_name"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); - writeJSONString(StringRef(msg.getSource()), wb, settings); + const char * logger_name = msg.getSource().c_str(); + if (logger_name != nullptr) + writeJSONString(logger_name, logger_name + strlen(logger_name), wb, settings); + else + writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); DB::writeChar(',', wb); - key_name = "message"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "message"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); - String msg_text = msg.getText(); - writeJSONString(StringRef(msg_text), wb, settings); + const char * msg_text = msg.getText().c_str(); + if (msg_text != nullptr) + writeJSONString(msg_text, msg_text + strlen(msg_text), wb, settings); + else + writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); DB::writeChar(',', wb); - key_name = "source_file"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "source_file"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); const char * source_file = msg.getSourceFile(); if (source_file != nullptr) - { - writeJSONString(StringRef(source_file), wb, settings); - } - + writeJSONString(source_file, source_file + strlen(source_file), wb, settings); else - { - writeJSONString(StringRef(""), wb, settings); - } + writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); DB::writeChar(',', wb); - key_name = "source_line"; - writeJSONString(StringRef(key_name), wb, settings); + strcpy(key_name, "source_line"); + writeJSONString(key_name, key_name + strlen(key_name), wb, settings); DB::writeChar(':', wb); DB::writeChar('\"', wb); DB::writeIntText(msg.getSourceLine(), wb); @@ -119,5 +132,5 @@ void OwnJSONPatternFormatter::formatExtendedJSON(const DB::ExtendedLogMessage & void OwnJSONPatternFormatter::format(const Poco::Message & msg, std::string & text) { - formatExtendedJSON(DB::ExtendedLogMessage::getFrom(msg), text); + formatExtended(DB::ExtendedLogMessage::getFrom(msg), text); } diff --git a/src/Loggers/OwnJSONPatternFormatter.h b/src/Loggers/OwnJSONPatternFormatter.h index 0523869aebb..76a0104317e 100644 --- a/src/Loggers/OwnJSONPatternFormatter.h +++ b/src/Loggers/OwnJSONPatternFormatter.h @@ -3,6 +3,7 @@ #include #include "ExtendedLogChannel.h" +#include "OwnPatternFormatter.h" /** Format log messages own way in JSON. 
@@ -21,11 +22,11 @@ class Loggers; -class OwnJSONPatternFormatter : public Poco::PatternFormatter +class OwnJSONPatternFormatter : public OwnPatternFormatter { public: OwnJSONPatternFormatter(); void format(const Poco::Message & msg, std::string & text) override; - static void formatExtendedJSON(const DB::ExtendedLogMessage & msg_ext, std::string & text); + static void formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text); }; diff --git a/src/Loggers/OwnPatternFormatter.cpp b/src/Loggers/OwnPatternFormatter.cpp index 02a2c2e510b..f5ee60a2113 100644 --- a/src/Loggers/OwnPatternFormatter.cpp +++ b/src/Loggers/OwnPatternFormatter.cpp @@ -3,17 +3,19 @@ #include #include #include -#include #include -#include #include +#include +#include -OwnPatternFormatter::OwnPatternFormatter(bool color_) - : Poco::PatternFormatter(""), color(color_) +OwnPatternFormatter::OwnPatternFormatter(bool color_) : Poco::PatternFormatter(""), color(color_) { } +OwnPatternFormatter::OwnPatternFormatter() : Poco::PatternFormatter("") +{ +} void OwnPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const { diff --git a/src/Loggers/OwnPatternFormatter.h b/src/Loggers/OwnPatternFormatter.h index fba4f0964cb..154068f75fe 100644 --- a/src/Loggers/OwnPatternFormatter.h +++ b/src/Loggers/OwnPatternFormatter.h @@ -25,6 +25,7 @@ class OwnPatternFormatter : public Poco::PatternFormatter { public: OwnPatternFormatter(bool color_ = false); + OwnPatternFormatter(); void format(const Poco::Message & msg, std::string & text) override; void formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const; From 116618106574ed71eb9642e1506375fe34de922b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 Jul 2022 19:59:04 -0700 Subject: [PATCH 037/164] removed unwanted changes from BaseDaemon.cpp --- src/Daemon/BaseDaemon.cpp | 235 +++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 118 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 81317aa3b6a..af68399c5f4 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1,69 +1,68 @@ #ifdef HAS_RESERVED_IDENTIFIER -# pragma clang diagnostic ignored "-Wreserved-identifier" +#pragma clang diagnostic ignored "-Wreserved-identifier" #endif #include #include -#include #include -#include #include +#include #include +#include #if defined(OS_LINUX) -# include + #include #endif #include -#include #include +#include #include -#include -#include -#include #include +#include +#include +#include #include -#include -#include #include #include +#include +#include -#include -#include -#include -#include #include +#include +#include +#include +#include -#include -#include -#include #include #include +#include +#include #include -#include -#include -#include #include #include #include +#include +#include +#include #include #include #include -#include -#include +#include #include +#include +#include #include -#include #include #include #if defined(OS_DARWIN) -# pragma GCC diagnostic ignored "-Wunused-macros" +# pragma GCC diagnostic ignored "-Wunused-macros" // NOLINTNEXTLINE(bugprone-reserved-identifier) -# define _XOPEN_SOURCE 700 // ucontext is not available without _XOPEN_SOURCE +# define _XOPEN_SOURCE 700 // ucontext is not available without _XOPEN_SOURCE #endif #include @@ -71,11 +70,11 @@ namespace fs = std::filesystem; namespace DB { -namespace ErrorCodes -{ - extern const int CANNOT_SET_SIGNAL_HANDLER; - extern const int CANNOT_SEND_SIGNAL; -} + 
namespace ErrorCodes + { + extern const int CANNOT_SET_SIGNAL_HANDLER; + extern const int CANNOT_SEND_SIGNAL; + } } DB::PipeFDs signal_pipe; @@ -93,14 +92,19 @@ static void call_default_signal_handler(int sig) DB::throwFromErrno("Cannot send signal.", DB::ErrorCodes::CANNOT_SEND_SIGNAL); } -static const size_t signal_pipe_buf_size - = sizeof(int) + sizeof(siginfo_t) + sizeof(ucontext_t *) + sizeof(StackTrace) + sizeof(UInt32) + sizeof(void *); +static const size_t signal_pipe_buf_size = + sizeof(int) + + sizeof(siginfo_t) + + sizeof(ucontext_t*) + + sizeof(StackTrace) + + sizeof(UInt32) + + sizeof(void*); -using signal_function = void(int, siginfo_t *, void *); +using signal_function = void(int, siginfo_t*, void*); static void writeSignalIDtoSignalPipe(int sig) { - auto saved_errno = errno; /// We must restore previous value of errno in signal handler. + auto saved_errno = errno; /// We must restore previous value of errno in signal handler. char buf[signal_pipe_buf_size]; DB::WriteBufferFromFileDescriptor out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf); @@ -129,7 +133,7 @@ static void terminateRequestedSignalHandler(int sig, siginfo_t *, void *) static void signalHandler(int sig, siginfo_t * info, void * context) { DENY_ALLOCATIONS_IN_SCOPE; - auto saved_errno = errno; /// We must restore previous value of errno in signal handler. + auto saved_errno = errno; /// We must restore previous value of errno in signal handler. char buf[signal_pipe_buf_size]; DB::WriteBufferFromFileDescriptorDiscardOnFailure out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf); @@ -149,7 +153,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context) if (sig != SIGTSTP) /// This signal is used for debugging. { /// The time that is usually enough for separate thread to print info into log. - sleepForSeconds(20); /// FIXME: use some feedback from threads that process stacktrace + sleepForSeconds(20); /// FIXME: use some feedback from threads that process stacktrace call_default_signal_handler(sig); } @@ -158,7 +162,8 @@ static void signalHandler(int sig, siginfo_t * info, void * context) /// Avoid link time dependency on DB/Interpreters - will use this function only when linked. -__attribute__((__weak__)) void collectCrashLog(Int32 signal, UInt64 thread_id, const String & query_id, const StackTrace & stack_trace); +__attribute__((__weak__)) void collectCrashLog( + Int32 signal, UInt64 thread_id, const String & query_id, const StackTrace & stack_trace); /** The thread that read info about signal or std::terminate from pipe. 
@@ -176,14 +181,16 @@ public: SanitizerTrap = -3, }; - explicit SignalListener(BaseDaemon & daemon_) : log(&Poco::Logger::get("BaseDaemon")), daemon(daemon_) { } + explicit SignalListener(BaseDaemon & daemon_) + : log(&Poco::Logger::get("BaseDaemon")) + , daemon(daemon_) + { + } void run() override { static_assert(PIPE_BUF >= 512); - static_assert( - signal_pipe_buf_size <= PIPE_BUF, - "Only write of PIPE_BUF to pipe is atomic and the minimal known PIPE_BUF across supported platforms is 512"); + static_assert(signal_pipe_buf_size <= PIPE_BUF, "Only write of PIPE_BUF to pipe is atomic and the minimal known PIPE_BUF across supported platforms is 512"); char buf[signal_pipe_buf_size]; DB::ReadBufferFromFileDescriptor in(signal_pipe.fds_rw[0], signal_pipe_buf_size, buf); @@ -218,7 +225,9 @@ public: onTerminate(message, thread_num); } - else if (sig == SIGINT || sig == SIGQUIT || sig == SIGTERM) + else if (sig == SIGINT || + sig == SIGQUIT || + sig == SIGTERM) { daemon.handleSignal(sig); } @@ -255,14 +264,8 @@ private: { size_t pos = message.find('\n'); - LOG_FATAL( - log, - "(version {}{}, {}) (from thread {}) {}", - VERSION_STRING, - VERSION_OFFICIAL, - daemon.build_id_info, - thread_num, - message.substr(0, pos)); + LOG_FATAL(log, "(version {}{}, {}) (from thread {}) {}", + VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, thread_num, message.substr(0, pos)); /// Print trace from std::terminate exception line-by-line to make it easy for grep. while (pos != std::string_view::npos) @@ -310,29 +313,15 @@ private: if (query_id.empty()) { - LOG_FATAL( - log, - "(version {}{}, {}) (from thread {}) (no query) Received signal {} ({})", - VERSION_STRING, - VERSION_OFFICIAL, - daemon.build_id_info, - thread_num, - strsignal(sig), - sig); + LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (no query) Received signal {} ({})", + VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, + thread_num, strsignal(sig), sig); } else { - LOG_FATAL( - log, - "(version {}{}, {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})", - VERSION_STRING, - VERSION_OFFICIAL, - daemon.build_id_info, - thread_num, - query_id, - query, - strsignal(sig), - sig); + LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})", + VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, + thread_num, query_id, query, strsignal(sig), sig); } String error_message; @@ -367,11 +356,8 @@ private: String calculated_binary_hash = getHashOfLoadedBinaryHex(); if (daemon.stored_binary_hash.empty()) { - LOG_FATAL( - log, - "Integrity check of the executable skipped because the reference checksum could not be read." - " (calculated checksum: {})", - calculated_binary_hash); + LOG_FATAL(log, "Integrity check of the executable skipped because the reference checksum could not be read." + " (calculated checksum: {})", calculated_binary_hash); } else if (calculated_binary_hash == daemon.stored_binary_hash) { @@ -379,18 +365,15 @@ private: } else { - LOG_FATAL( - log, - "Calculated checksum of the executable ({0}) does not correspond" + LOG_FATAL(log, "Calculated checksum of the executable ({0}) does not correspond" " to the reference checksum stored in the executable ({1})." 
" This may indicate one of the following:" " - the executable was changed just after startup;" " - the executable was corrupted on disk due to faulty hardware;" " - the loaded executable was corrupted in memory due to faulty hardware;" " - the file was intentionally modified;" - " - a logical error in the code.", - calculated_binary_hash, - daemon.stored_binary_hash); + " - a logical error in the code." + , calculated_binary_hash, daemon.stored_binary_hash); } #endif @@ -412,7 +395,12 @@ private: #if defined(SANITIZER) extern "C" void __sanitizer_set_death_callback(void (*)()); -static void sanitizerDeathCallback() +/// Sanitizers may not expect some function calls from death callback. +/// Let's try to disable instrumentation to avoid possible issues. +/// However, this callback may call other functions that are still instrumented. +/// We can try [[clang::always_inline]] attribute for statements in future (available in clang-15) +/// See https://github.com/google/sanitizers/issues/1543 and https://github.com/google/sanitizers/issues/1549. +static DISABLE_SANITIZER_INSTRUMENTATION void sanitizerDeathCallback() { DENY_ALLOCATIONS_IN_SCOPE; /// Also need to send data via pipe. Otherwise it may lead to deadlocks or failures in printing diagnostic info. @@ -620,9 +608,7 @@ void debugIncreaseOOMScore() LOG_INFO(&Poco::Logger::root(), "Set OOM score adjustment to {}", new_score); } #else -void debugIncreaseOOMScore() -{ -} +void debugIncreaseOOMScore() {} #endif } @@ -745,12 +731,14 @@ void BaseDaemon::initialize(Application & self) if (!log_path.empty()) { std::string path = createDirectory(log_path); - if (is_daemon && chdir(path.c_str()) != 0) + if (is_daemon + && chdir(path.c_str()) != 0) throw Poco::Exception("Cannot change directory to " + path); } else { - if (is_daemon && chdir("/tmp") != 0) + if (is_daemon + && chdir("/tmp") != 0) throw Poco::Exception("Cannot change directory to /tmp"); } @@ -897,40 +885,50 @@ void BaseDaemon::initializeTerminationAndSignalProcessing() void BaseDaemon::logRevision() const { - Poco::Logger::root().information( - "Starting " + std::string{VERSION_FULL} + " with revision " + std::to_string(ClickHouseRevision::getVersionRevision()) + ", " - + build_id_info + ", PID " + std::to_string(getpid())); + Poco::Logger::root().information("Starting " + std::string{VERSION_FULL} + + " with revision " + std::to_string(ClickHouseRevision::getVersionRevision()) + + ", " + build_id_info + + ", PID " + std::to_string(getpid())); } void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options) { - new_options.addOption(Poco::Util::Option("config-file", "C", "load configuration from a given file") - .required(false) - .repeatable(false) - .argument("") - .binding("config-file")); - - new_options.addOption(Poco::Util::Option("log-file", "L", "use given log file") - .required(false) - .repeatable(false) - .argument("") - .binding("logger.log")); - - new_options.addOption(Poco::Util::Option("errorlog-file", "E", "use given log file for errors only") - .required(false) - .repeatable(false) - .argument("") - .binding("logger.errorlog")); + new_options.addOption( + Poco::Util::Option("config-file", "C", "load configuration from a given file") + .required(false) + .repeatable(false) + .argument("") + .binding("config-file")); new_options.addOption( - Poco::Util::Option("pid-file", "P", "use given pidfile").required(false).repeatable(false).argument("").binding("pid")); + Poco::Util::Option("log-file", "L", "use given log file") + .required(false) + .repeatable(false) + 
.argument("") + .binding("logger.log")); + + new_options.addOption( + Poco::Util::Option("errorlog-file", "E", "use given log file for errors only") + .required(false) + .repeatable(false) + .argument("") + .binding("logger.errorlog")); + + new_options.addOption( + Poco::Util::Option("pid-file", "P", "use given pidfile") + .required(false) + .repeatable(false) + .argument("") + .binding("pid")); Poco::Util::ServerApplication::defineOptions(new_options); } void BaseDaemon::handleSignal(int signal_id) { - if (signal_id == SIGINT || signal_id == SIGQUIT || signal_id == SIGTERM) + if (signal_id == SIGINT || + signal_id == SIGQUIT || + signal_id == SIGTERM) { std::lock_guard lock(signal_handler_mutex); { @@ -964,7 +962,7 @@ void BaseDaemon::waitForTerminationRequest() { /// NOTE: as we already process signals via pipe, we don't have to block them with sigprocmask in threads std::unique_lock lock(signal_handler_mutex); - signal_event.wait(lock, [this]() { return terminate_signals_counter > 0; }); + signal_event.wait(lock, [this](){ return terminate_signals_counter > 0; }); } @@ -1003,7 +1001,7 @@ void BaseDaemon::setupWatchdog() } /// Change short thread name and process name. - setThreadName("clckhouse-watch"); /// 15 characters + setThreadName("clckhouse-watch"); /// 15 characters if (argv0) { @@ -1016,12 +1014,14 @@ void BaseDaemon::setupWatchdog() if (config().getRawString("logger.stream_compress", "false") == "true") { Poco::AutoPtr pf; - if (config().has("logger.json")) + { pf = new OwnJSONPatternFormatter; + } else + { pf = new OwnPatternFormatter(true); - + } Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); logger().setChannel(log); } @@ -1031,7 +1031,8 @@ void BaseDaemon::setupWatchdog() /// Forward signals to the child process. addSignalHandler( {SIGHUP, SIGINT, SIGQUIT, SIGTERM}, - [](int sig, siginfo_t *, void *) { + [](int sig, siginfo_t *, void *) + { /// Forward all signals except INT as it can be send by terminal to the process group when user press Ctrl+C, /// and we process double delivery of this signal as immediate termination. if (sig == SIGINT) @@ -1080,11 +1081,9 @@ void BaseDaemon::setupWatchdog() if (sig == SIGKILL) { - logger().fatal(fmt::format( - "Child process was terminated by signal {} (KILL)." + logger().fatal(fmt::format("Child process was terminated by signal {} (KILL)." 
" If it is not done by 'forcestop' command or manually," - " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", - sig)); + " the possible cause is OOM Killer (see 'dmesg' and look at the '/var/log/kern.log' for the details).", sig)); } else { From 943affe2da8f4ad55c4de926c699a74b6a42353c Mon Sep 17 00:00:00 2001 From: Mallik Hassan Date: Tue, 26 Jul 2022 03:43:04 -0300 Subject: [PATCH 038/164] Update src/Daemon/BaseDaemon.cpp Co-authored-by: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> --- src/Daemon/BaseDaemon.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index af68399c5f4..fc6539a739f 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1015,13 +1015,10 @@ void BaseDaemon::setupWatchdog() { Poco::AutoPtr pf; if (config().has("logger.json")) - { pf = new OwnJSONPatternFormatter; - } else - { pf = new OwnPatternFormatter(true); - } + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); logger().setChannel(log); } From 968e867a0d884df5f3ccd5b07846caae9b087a85 Mon Sep 17 00:00:00 2001 From: Mallik Hassan Date: Tue, 26 Jul 2022 03:44:16 -0300 Subject: [PATCH 039/164] Update src/Loggers/Loggers.cpp Co-authored-by: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> --- src/Loggers/Loggers.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index c02363c6017..fc34c978374 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -204,20 +204,16 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log { max_log_level = console_log_level; } + + Poco::AutoPtr pf = new OwnPatternFormatter; if (config.has("logger.json")) - { - Poco::AutoPtr pf = new OwnJSONPatternFormatter(); - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); - log->setLevel(console_log_level); - split->addChannel(log, "console"); - } + pf = new OwnJSONPatternFormatter; else - { - Poco::AutoPtr pf = new OwnPatternFormatter(color_enabled); - Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); - log->setLevel(console_log_level); - split->addChannel(log, "console"); - } + pf = new OwnPatternFormatter(true); + + Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); + log->setLevel(console_log_level); + split->addChannel(log, "console"); } From 9607b21e731e45489b1674f90bb5edcb47c96d37 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 26 Jul 2022 09:00:46 +0000 Subject: [PATCH 040/164] Fix integration test test_timezone_config --- tests/integration/test_timezone_config/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_timezone_config/test.py b/tests/integration/test_timezone_config/test.py index e4a9f75abab..180026c5818 100644 --- a/tests/integration/test_timezone_config/test.py +++ b/tests/integration/test_timezone_config/test.py @@ -26,9 +26,9 @@ def test_overflow_toDate(start_cluster): def test_overflow_toDate32(start_cluster): - assert node.query("SELECT toDate32('2999-12-31','UTC')") == "2283-11-11\n" + assert node.query("SELECT toDate32('2999-12-31','UTC')") == "2299-12-31\n" assert node.query("SELECT toDate32('2021-12-21','UTC')") == "2021-12-21\n" - assert node.query("SELECT toDate32('1000-12-31','UTC')") == "1925-01-01\n" + assert 
node.query("SELECT toDate32('1000-12-31','UTC')") == "1900-01-01\n" def test_overflow_toDateTime(start_cluster): From ac2a42cfb0f68252d0fb2fcf8b17ec70949c5566 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 26 Jul 2022 10:44:58 +0000 Subject: [PATCH 041/164] Fix test 00189_time_zones_long and integration test test_materialized_mysql_database --- .../materialize_with_ddl.py | 6 +++--- tests/queries/0_stateless/00189_time_zones_long.reference | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index d7ea0c13a93..22d4633685e 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -2126,9 +2126,9 @@ def materialized_database_mysql_date_type_to_date32( mysql_node.query( "CREATE TABLE test_database.a (a INT(11) NOT NULL PRIMARY KEY, b date DEFAULT NULL)" ) - # can't support date that less than 1925 year for now - mysql_node.query("INSERT INTO test_database.a VALUES(1, '1900-04-16')") - # test date that is older than 1925 + # can't support date that less than 1900 year for now + mysql_node.query("INSERT INTO test_database.a VALUES(1, '1899-04-16')") + # test date that is older than 1900 mysql_node.query("INSERT INTO test_database.a VALUES(3, '1971-02-16')") mysql_node.query("INSERT INTO test_database.a VALUES(4, '2101-05-16')") diff --git a/tests/queries/0_stateless/00189_time_zones_long.reference b/tests/queries/0_stateless/00189_time_zones_long.reference index e53ec7ca815..8717a662771 100644 --- a/tests/queries/0_stateless/00189_time_zones_long.reference +++ b/tests/queries/0_stateless/00189_time_zones_long.reference @@ -137,7 +137,7 @@ toStartOfInterval 2015-01-01 2019-01-01 2019-01-01 -2018-10-01 +2018-07-01 2019-02-01 2019-01-01 2018-10-01 @@ -164,7 +164,7 @@ toStartOfInterval 2015-01-01 2019-01-01 2019-01-01 -2018-10-01 +2018-07-01 2019-02-01 2019-01-01 2018-10-01 From f7dcf23404dd777e166da58685b53232c98ed713 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 26 Jul 2022 06:41:52 -0700 Subject: [PATCH 042/164] removed pFormattterJSON pointer from OwnFormattingChannel and reformatted Loggers and BaseDaemon to look nicer --- src/Daemon/BaseDaemon.cpp | 3 +++ src/Loggers/Loggers.cpp | 28 ++++++++++++++++------------ src/Loggers/OwnFormattingChannel.cpp | 14 ++++++++------ src/Loggers/OwnFormattingChannel.h | 7 ------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index fc6539a739f..2a3b24b6999 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1018,7 +1018,10 @@ void BaseDaemon::setupWatchdog() pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); +<<<<<<< HEAD +======= +>>>>>>> removed pFormattterJSON pointer from OwnFormattingChannel and reformatted Loggers and BaseDaemon to look nicer Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); logger().setChannel(log); } diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index fc34c978374..6f8c88a7e87 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -1,18 +1,17 @@ #include "Loggers.h" #include -#include -#include -#include #include #include #include "OwnFormattingChannel.h" -#include "OwnJSONPatternFormatter.h" #include "OwnPatternFormatter.h" #include "OwnSplitChannel.h" 
+#include +#include +#include #ifdef WITH_TEXT_LOG -# include + #include #endif #include @@ -21,9 +20,10 @@ namespace fs = std::filesystem; namespace DB { -class SensitiveDataMasker; + class SensitiveDataMasker; } + // TODO: move to libcommon static std::string createDirectory(const std::string & file) { @@ -49,6 +49,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (auto log = text_log.lock()) split->addTextLog(log, text_log_max_priority); #endif + auto current_logger = config.getString("logger", ""); if (config_logger == current_logger) //-V1051 return; @@ -138,6 +139,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log error_log_file->setProperty(Poco::FileChannel::PROP_ROTATEONOPEN, config.getRawString("logger.rotateOnOpen", "false")); Poco::AutoPtr pf; + if (config.has("logger.json")) pf = new OwnJSONPatternFormatter; else @@ -179,6 +181,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log syslog_channel->setProperty(Poco::SyslogChannel::PROP_FACILITY, config.getString("logger.syslog.facility", "LOG_DAEMON")); } syslog_channel->open(); + Poco::AutoPtr pf; if (config.has("logger.json")) @@ -194,7 +197,9 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log bool should_log_to_console = isatty(STDIN_FILENO) || isatty(STDERR_FILENO); bool color_logs_by_default = isatty(STDERR_FILENO); - if (config.getBool("logger.console", false) || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) + + if (config.getBool("logger.console", false) + || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) { bool color_enabled = config.getBool("logger.color_terminal", color_logs_by_default); @@ -205,18 +210,16 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log max_log_level = console_log_level; } - Poco::AutoPtr pf = new OwnPatternFormatter; + Poco::AutoPtr pf; if (config.has("logger.json")) pf = new OwnJSONPatternFormatter; else - pf = new OwnPatternFormatter(true); - + pf = new OwnPatternFormatter(color_enabled); Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); log->setLevel(console_log_level); split->addChannel(log, "console"); } - split->open(); logger.close(); logger.setChannel(split); @@ -276,7 +279,8 @@ void Loggers::updateLevels(Poco::Util::AbstractConfiguration & config, Poco::Log // Set level to console bool is_daemon = config.getBool("application.runAsDaemon", false); bool should_log_to_console = isatty(STDIN_FILENO) || isatty(STDERR_FILENO); - if (config.getBool("logger.console", false) || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) + if (config.getBool("logger.console", false) + || (!config.hasProperty("logger.console") && !is_daemon && should_log_to_console)) split->setLevel("console", log_level); else split->setLevel("console", 0); diff --git a/src/Loggers/OwnFormattingChannel.cpp b/src/Loggers/OwnFormattingChannel.cpp index a023c28c5ae..d101f0fcbd7 100644 --- a/src/Loggers/OwnFormattingChannel.cpp +++ b/src/Loggers/OwnFormattingChannel.cpp @@ -7,13 +7,15 @@ void OwnFormattingChannel::logExtended(const ExtendedLogMessage & msg) { if (pChannel && priority >= msg.base.getPriority()) { - if (pFormatterJSON || pFormatter) + std::string text; + if (auto * formatter = dynamic_cast(pFormatter.get())) { - std::string text; - if (pFormatterJSON) - pFormatterJSON->formatExtended(msg, text); - else - 
pFormatter->formatExtended(msg, text); + formatter->formatExtended(msg, text); + pChannel->log(Poco::Message(msg.base, text)); + } + else if(pFormatter) + { + pFormatter->formatExtended(msg, text); pChannel->log(Poco::Message(msg.base, text)); } else diff --git a/src/Loggers/OwnFormattingChannel.h b/src/Loggers/OwnFormattingChannel.h index 12e8b24192d..f8e482f51ba 100644 --- a/src/Loggers/OwnFormattingChannel.h +++ b/src/Loggers/OwnFormattingChannel.h @@ -20,12 +20,6 @@ public: { } - explicit OwnFormattingChannel( - Poco::AutoPtr pFormatterJSON_ = nullptr, Poco::AutoPtr pChannel_ = nullptr) - : pFormatterJSON(std::move(pFormatterJSON_)), pChannel(std::move(pChannel_)), priority(Poco::Message::PRIO_TRACE) - { - } - void setChannel(Poco::AutoPtr pChannel_) { pChannel = std::move(pChannel_); } void setLevel(Poco::Message::Priority priority_) { priority = priority_; } @@ -52,7 +46,6 @@ public: private: Poco::AutoPtr pFormatter; - Poco::AutoPtr pFormatterJSON; Poco::AutoPtr pChannel; std::atomic priority; }; From 0e68ac4e4db680101b8b4f6e03cfd92b7cb3b45b Mon Sep 17 00:00:00 2001 From: Mallik Hassan Date: Tue, 26 Jul 2022 11:34:17 -0300 Subject: [PATCH 043/164] Update BaseDaemon.cpp removed conflict markers from BaseDaemon.cpp --- src/Daemon/BaseDaemon.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 2a3b24b6999..e162360ddaa 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1018,10 +1018,6 @@ void BaseDaemon::setupWatchdog() pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); -<<<<<<< HEAD - -======= ->>>>>>> removed pFormattterJSON pointer from OwnFormattingChannel and reformatted Loggers and BaseDaemon to look nicer Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); logger().setChannel(log); } From 6406bd998a77eec6f6412f4a52eb84e09a3d733c Mon Sep 17 00:00:00 2001 From: root Date: Tue, 26 Jul 2022 08:01:59 -0700 Subject: [PATCH 044/164] style check correction in OwnFormattingChannel --- src/Loggers/OwnFormattingChannel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Loggers/OwnFormattingChannel.cpp b/src/Loggers/OwnFormattingChannel.cpp index d101f0fcbd7..1487c5ed03b 100644 --- a/src/Loggers/OwnFormattingChannel.cpp +++ b/src/Loggers/OwnFormattingChannel.cpp @@ -13,7 +13,7 @@ void OwnFormattingChannel::logExtended(const ExtendedLogMessage & msg) formatter->formatExtended(msg, text); pChannel->log(Poco::Message(msg.base, text)); } - else if(pFormatter) + else if (pFormatter) { pFormatter->formatExtended(msg, text); pChannel->log(Poco::Message(msg.base, text)); From fa0707b4e788620cf175487a210f4454427efadc Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Jul 2022 17:51:09 +0000 Subject: [PATCH 045/164] Review fixes. --- src/Common/MemoryTracker.cpp | 4 ++-- src/Interpreters/AsynchronousMetrics.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index ef5e0c45373..35df2f9e473 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -204,7 +204,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT /// We can't track all memory allocations from external libraries (yet). 
if (level == VariableContext::Global) { - if (Int64 current_rss = size + rss.fetch_add(size, std::memory_order_relaxed); unlikely(current_rss > will_be)) + if (Int64 current_rss = size + rss.fetch_add(size, std::memory_order_relaxed); current_rss > will_be) { used_rss_counter = true; amount_to_check = current_rss; @@ -306,7 +306,7 @@ void MemoryTracker::free(Int64 size) amount.fetch_sub(size, std::memory_order_relaxed); auto metric_loaded = metric.load(std::memory_order_relaxed); if (metric_loaded != CurrentMetrics::end()) - CurrentMetrics::add(metric_loaded, size); + CurrentMetrics::sub(metric_loaded, size); } /// Since the MemoryTrackerBlockerInThread should respect the level, we should go to the next parent. diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 32c4e421ac3..81fdef3d8a6 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -726,7 +726,6 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } total_memory_tracker.setRSS(rss); - // CurrentMetrics::set(CurrentMetrics::MemoryTracking, new_amount); } } #endif From 1e3fa2e01fa295ee6318ceab880f81f6fff339d1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 18 Jul 2022 15:53:30 +0000 Subject: [PATCH 046/164] Refactor PreparedSets/SubqueryForSet --- src/Interpreters/ActionsVisitor.cpp | 39 ++++---- src/Interpreters/ActionsVisitor.h | 11 +-- src/Interpreters/ConcurrentHashJoin.cpp | 1 - src/Interpreters/ExpressionAnalyzer.cpp | 27 +++--- src/Interpreters/ExpressionAnalyzer.h | 19 ++-- src/Interpreters/GlobalSubqueriesVisitor.h | 17 ++-- src/Interpreters/InterpreterSelectQuery.cpp | 41 +++------ src/Interpreters/InterpreterSelectQuery.h | 16 ++-- src/Interpreters/MutationsInterpreter.cpp | 7 +- src/Interpreters/PreparedSets.cpp | 87 ++++++++++++++++++ src/Interpreters/PreparedSets.h | 92 +++++++++++-------- src/Interpreters/SubqueryForSet.cpp | 13 --- src/Interpreters/SubqueryForSet.h | 37 -------- src/Processors/QueryPlan/CreatingSetsStep.cpp | 16 ++-- src/Processors/QueryPlan/CreatingSetsStep.h | 4 +- .../Transforms/CreatingSetsTransform.h | 3 +- src/QueryPipeline/QueryPipelineBuilder.h | 1 - src/Storages/MergeTree/KeyCondition.cpp | 15 +-- src/Storages/MergeTree/KeyCondition.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 6 +- .../MergeTreeIndexConditionBloomFilter.cpp | 9 +- .../MergeTree/MergeTreeIndexFullText.cpp | 5 +- .../MergeTree/MergeTreeIndexFullText.h | 3 +- .../RocksDB/StorageEmbeddedRocksDB.cpp | 24 ++--- src/Storages/SelectQueryInfo.h | 10 +- src/Storages/VirtualColumnUtils.cpp | 6 +- 26 files changed, 263 insertions(+), 248 deletions(-) create mode 100644 src/Interpreters/PreparedSets.cpp delete mode 100644 src/Interpreters/SubqueryForSet.cpp delete mode 100644 src/Interpreters/SubqueryForSet.h diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index b62690b7a3a..2e95a5f906f 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -401,8 +401,8 @@ SetPtr makeExplicitSet( element_type = low_cardinality_type->getDictionaryType(); auto set_key = PreparedSetKey::forLiteral(*right_arg, set_element_types); - if (auto it = prepared_sets.find(set_key); it != prepared_sets.end()) - return it->second; /// Already prepared. + if (auto set = prepared_sets.getSet(set_key)) + return set; /// Already prepared. 
Block block; const auto & right_arg_func = std::dynamic_pointer_cast(right_arg); @@ -417,7 +417,7 @@ SetPtr makeExplicitSet( set->insertFromBlock(block.getColumnsWithTypeAndName()); set->finishInsert(); - prepared_sets.emplace(set_key, set); + prepared_sets.setSet(set_key, set); return set; } @@ -484,8 +484,7 @@ ActionsMatcher::Data::Data( size_t subquery_depth_, std::reference_wrapper source_columns_, ActionsDAGPtr actions_dag, - PreparedSets & prepared_sets_, - SubqueriesForSets & subqueries_for_sets_, + PreparedSetsPtr prepared_sets_, bool no_subqueries_, bool no_makeset_, bool only_consts_, @@ -497,7 +496,6 @@ ActionsMatcher::Data::Data( , subquery_depth(subquery_depth_) , source_columns(source_columns_) , prepared_sets(prepared_sets_) - , subqueries_for_sets(subqueries_for_sets_) , no_subqueries(no_subqueries_) , no_makeset(no_makeset_) , only_consts(only_consts_) @@ -1272,6 +1270,9 @@ void ActionsMatcher::visit(const ASTLiteral & literal, const ASTPtr & /* ast */, SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_subqueries) { + if (!data.prepared_sets) + return nullptr; + /** You need to convert the right argument to a set. * This can be a table name, a value, a value enumeration, or a subquery. * The enumeration of values is parsed as a function `tuple`. @@ -1287,8 +1288,8 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su if (no_subqueries) return {}; auto set_key = PreparedSetKey::forSubquery(*right_in_operand); - if (auto it = data.prepared_sets.find(set_key); it != data.prepared_sets.end()) - return it->second; + if (SetPtr set = data.prepared_sets->getSet(set_key)) + return set; /// A special case is if the name of the table is specified on the right side of the IN statement, /// and the table has the type Set (a previously prepared set). @@ -1302,8 +1303,9 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su StorageSet * storage_set = dynamic_cast(table.get()); if (storage_set) { - data.prepared_sets.emplace(set_key, storage_set->getSet()); - return storage_set->getSet(); + SetPtr set = storage_set->getSet(); + data.prepared_sets->setSet(set_key, set); + return set; } } } @@ -1311,16 +1313,10 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery. String set_id = right_in_operand->getColumnName(); - SubqueryForSet & subquery_for_set = data.subqueries_for_sets[set_id]; + SubqueryForSet & subquery_for_set = data.prepared_sets->createOrGetSubquery(set_id, set_key); - /// If you already created a Set with the same subquery / table. if (subquery_for_set.set) - { - data.prepared_sets.emplace(set_key, subquery_for_set.set); return subquery_for_set.set; - } - - SetPtr set = std::make_shared(data.set_size_limit, false, data.getContext()->getSettingsRef().transform_null_in); /** The following happens for GLOBAL INs or INs: * - in the addExternalStorage function, the IN (SELECT ...) 
subquery is replaced with IN _data1, @@ -1337,17 +1333,16 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su interpreter->buildQueryPlan(*subquery_for_set.source); } - subquery_for_set.set = set; - data.prepared_sets.emplace(set_key, set); - return set; + subquery_for_set.set = std::make_shared(data.set_size_limit, false, data.getContext()->getSettingsRef().transform_null_in); + return subquery_for_set.set; } else { const auto & last_actions = data.actions_stack.getLastActions(); const auto & index = data.actions_stack.getLastActionsIndex(); - if (index.contains(left_in_operand->getColumnName())) + if (data.prepared_sets && index.contains(left_in_operand->getColumnName())) /// An explicit enumeration of values in parentheses. - return makeExplicitSet(&node, last_actions, false, data.getContext(), data.set_size_limit, data.prepared_sets); + return makeExplicitSet(&node, last_actions, false, data.getContext(), data.set_size_limit, *data.prepared_sets); else return {}; } diff --git a/src/Interpreters/ActionsVisitor.h b/src/Interpreters/ActionsVisitor.h index 5b5a3d31da2..a27745d2cfa 100644 --- a/src/Interpreters/ActionsVisitor.h +++ b/src/Interpreters/ActionsVisitor.h @@ -5,10 +5,9 @@ #include #include #include -#include #include #include - +#include namespace DB { @@ -115,7 +114,7 @@ struct AggregationKeysInfo GroupByKind group_by_kind; }; -/// Collect ExpressionAction from AST. Returns PreparedSets and SubqueriesForSets too. +/// Collect ExpressionAction from AST. Returns PreparedSets class ActionsMatcher { public: @@ -126,8 +125,7 @@ public: SizeLimits set_size_limit; size_t subquery_depth; const NamesAndTypesList & source_columns; - PreparedSets & prepared_sets; - SubqueriesForSets & subqueries_for_sets; + PreparedSetsPtr prepared_sets; bool no_subqueries; bool no_makeset; bool only_consts; @@ -150,8 +148,7 @@ public: size_t subquery_depth_, std::reference_wrapper source_columns_, ActionsDAGPtr actions_dag, - PreparedSets & prepared_sets_, - SubqueriesForSets & subqueries_for_sets_, + PreparedSetsPtr prepared_sets_, bool no_subqueries_, bool no_makeset_, bool only_consts_, diff --git a/src/Interpreters/ConcurrentHashJoin.cpp b/src/Interpreters/ConcurrentHashJoin.cpp index 5d6318a8df1..0d86e1ff8ca 100644 --- a/src/Interpreters/ConcurrentHashJoin.cpp +++ b/src/Interpreters/ConcurrentHashJoin.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 8a14c09819a..f7c6dd46233 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -227,18 +227,16 @@ ExpressionAnalyzer::ExpressionAnalyzer( size_t subquery_depth_, bool do_global, bool is_explain, - SubqueriesForSets subqueries_for_sets_, - PreparedSets prepared_sets_) + PreparedSetsPtr prepared_sets_) : WithContext(context_) , query(query_), settings(getContext()->getSettings()) , subquery_depth(subquery_depth_) , syntax(syntax_analyzer_result_) { /// Cache prepared sets because we might run analysis multiple times - subqueries_for_sets = std::move(subqueries_for_sets_); - prepared_sets = std::move(prepared_sets_); + prepared_sets = prepared_sets_; - /// external_tables, subqueries_for_sets for global subqueries. + /// external_tables, sets for global subqueries. /// Replaces global subqueries with the generated names of temporary tables that will be sent to remote servers. 
initGlobalSubqueriesAndExternalTables(do_global, is_explain); @@ -502,7 +500,7 @@ void ExpressionAnalyzer::initGlobalSubqueriesAndExternalTables(bool do_global, b if (do_global) { GlobalSubqueriesVisitor::Data subqueries_data( - getContext(), subquery_depth, isRemoteStorage(), is_explain, external_tables, subqueries_for_sets, has_global_subqueries); + getContext(), subquery_depth, isRemoteStorage(), is_explain, external_tables, prepared_sets, has_global_subqueries); GlobalSubqueriesVisitor(subqueries_data).visit(query); } } @@ -510,14 +508,17 @@ void ExpressionAnalyzer::initGlobalSubqueriesAndExternalTables(bool do_global, b void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name, const SelectQueryOptions & query_options) { + if (!prepared_sets) + return; + auto set_key = PreparedSetKey::forSubquery(*subquery_or_table_name); - if (prepared_sets.contains(set_key)) + if (prepared_sets->getSet(set_key)) return; /// Already prepared. if (auto set_ptr_from_storage_set = isPlainStorageSetInSubquery(subquery_or_table_name)) { - prepared_sets.insert({set_key, set_ptr_from_storage_set}); + prepared_sets->setSet(set_key, set_ptr_from_storage_set); return; } @@ -541,7 +542,7 @@ void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_ set->finishInsert(); - prepared_sets[set_key] = std::move(set); + prepared_sets->setSet(set_key, std::move(set)); } SetPtr ExpressionAnalyzer::isPlainStorageSetInSubquery(const ASTPtr & subquery_or_table_name) @@ -597,8 +598,8 @@ void SelectQueryExpressionAnalyzer::makeSetsForIndex(const ASTPtr & node) auto temp_actions = std::make_shared(columns_after_join); getRootActions(left_in_operand, true, temp_actions); - if (temp_actions->tryFindInIndex(left_in_operand->getColumnName())) - makeExplicitSet(func, *temp_actions, true, getContext(), settings.size_limits_for_set, prepared_sets); + if (prepared_sets && temp_actions->tryFindInIndex(left_in_operand->getColumnName())) + makeExplicitSet(func, *temp_actions, true, getContext(), settings.size_limits_for_set, *prepared_sets); } } } @@ -615,7 +616,6 @@ void ExpressionAnalyzer::getRootActions(const ASTPtr & ast, bool no_makeset_for_ sourceColumns(), std::move(actions), prepared_sets, - subqueries_for_sets, no_makeset_for_subqueries, false /* no_makeset */, only_consts, @@ -635,7 +635,6 @@ void ExpressionAnalyzer::getRootActionsNoMakeSet(const ASTPtr & ast, ActionsDAGP sourceColumns(), std::move(actions), prepared_sets, - subqueries_for_sets, true /* no_makeset_for_subqueries, no_makeset implies no_makeset_for_subqueries */, true /* no_makeset */, only_consts, @@ -657,7 +656,6 @@ void ExpressionAnalyzer::getRootActionsForHaving( sourceColumns(), std::move(actions), prepared_sets, - subqueries_for_sets, no_makeset_for_subqueries, false /* no_makeset */, only_consts, @@ -678,7 +676,6 @@ void ExpressionAnalyzer::getRootActionsForWindowFunctions(const ASTPtr & ast, bo sourceColumns(), std::move(actions), prepared_sets, - subqueries_for_sets, no_makeset_for_subqueries, false /* no_makeset */, false /*only_consts */, diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index da92bc10832..7bc0891bbd5 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -50,8 +49,7 @@ struct ExpressionAnalyzerData { ~ExpressionAnalyzerData(); - SubqueriesForSets subqueries_for_sets; - PreparedSets prepared_sets; + PreparedSetsPtr 
prepared_sets; std::unique_ptr joined_plan; @@ -106,7 +104,7 @@ public: /// Ctor for non-select queries. Generally its usage is: /// auto actions = ExpressionAnalyzer(query, syntax, context).getActions(); ExpressionAnalyzer(const ASTPtr & query_, const TreeRewriterResultPtr & syntax_analyzer_result_, ContextPtr context_) - : ExpressionAnalyzer(query_, syntax_analyzer_result_, context_, 0, false, false, {}, {}) + : ExpressionAnalyzer(query_, syntax_analyzer_result_, context_, 0, false, false, {}) { } @@ -130,9 +128,7 @@ public: * That is, you need to call getSetsWithSubqueries after all calls of `append*` or `getActions` * and create all the returned sets before performing the actions. */ - SubqueriesForSets & getSubqueriesForSets() { return subqueries_for_sets; } - - PreparedSets & getPreparedSets() { return prepared_sets; } + PreparedSetsPtr getPreparedSets() { return prepared_sets; } /// Get intermediates for tests const ExpressionAnalyzerData & getAnalyzedData() const { return *this; } @@ -164,8 +160,7 @@ protected: size_t subquery_depth_, bool do_global_, bool is_explain_, - SubqueriesForSets subqueries_for_sets_, - PreparedSets prepared_sets_); + PreparedSetsPtr prepared_sets_); ASTPtr query; const ExtractedSettings settings; @@ -317,8 +312,7 @@ public: const NameSet & required_result_columns_ = {}, bool do_global_ = false, const SelectQueryOptions & options_ = {}, - SubqueriesForSets subqueries_for_sets_ = {}, - PreparedSets prepared_sets_ = {}) + PreparedSetsPtr prepared_sets_ = nullptr) : ExpressionAnalyzer( query_, syntax_analyzer_result_, @@ -326,8 +320,7 @@ public: options_.subquery_depth, do_global_, options_.is_explain, - std::move(subqueries_for_sets_), - std::move(prepared_sets_)) + prepared_sets_) , metadata_snapshot(metadata_snapshot_) , required_result_columns(required_result_columns_) , query_options(options_) diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 7086b1e950d..829beedbafa 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,7 +41,7 @@ public: bool is_remote; bool is_explain; TemporaryTablesMapping & external_tables; - SubqueriesForSets & subqueries_for_sets; + PreparedSetsPtr prepared_sets; bool & has_global_subqueries; Data( @@ -50,14 +50,14 @@ public: bool is_remote_, bool is_explain_, TemporaryTablesMapping & tables, - SubqueriesForSets & subqueries_for_sets_, + PreparedSetsPtr prepared_sets_, bool & has_global_subqueries_) : WithContext(context_) , subquery_depth(subquery_depth_) , is_remote(is_remote_) , is_explain(is_explain_) , external_tables(tables) - , subqueries_for_sets(subqueries_for_sets_) + , prepared_sets(prepared_sets_) , has_global_subqueries(has_global_subqueries_) { } @@ -178,9 +178,12 @@ public: } else { - subqueries_for_sets[external_table_name].source = std::make_unique(); - interpreter->buildQueryPlan(*subqueries_for_sets[external_table_name].source); - subqueries_for_sets[external_table_name].table = external_storage; + auto set_key = PreparedSetKey::forSubquery(*subquery_or_table_name); + auto & subquery_for_set = prepared_sets->createOrGetSubquery(external_table_name, set_key); + + subquery_for_set.source = std::make_unique(); + interpreter->buildQueryPlan(*subquery_for_set.source); + subquery_for_set.table = external_storage; } /** NOTE If it was written IN tmp_table - the existing temporary (but not external) table, diff --git 
a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index de01115abec..a33c46ee248 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -212,10 +212,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( const ASTPtr & query_ptr_, const ContextPtr & context_, const SelectQueryOptions & options_, - SubqueriesForSets subquery_for_sets_, - PreparedSets prepared_sets_) + PreparedSetsPtr prepared_sets_) : InterpreterSelectQuery( - query_ptr_, context_, std::nullopt, nullptr, options_, {}, {}, std::move(subquery_for_sets_), std::move(prepared_sets_)) + query_ptr_, context_, std::nullopt, nullptr, options_, {}, {}, prepared_sets_) {} InterpreterSelectQuery::~InterpreterSelectQuery() = default; @@ -333,8 +332,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( const SelectQueryOptions & options_, const Names & required_result_column_names, const StorageMetadataPtr & metadata_snapshot_, - SubqueriesForSets subquery_for_sets_, - PreparedSets prepared_sets_) + PreparedSetsPtr prepared_sets_) : InterpreterSelectQuery( query_ptr_, Context::createCopy(context_), @@ -343,8 +341,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( options_, required_result_column_names, metadata_snapshot_, - std::move(subquery_for_sets_), - std::move(prepared_sets_)) + prepared_sets_) {} InterpreterSelectQuery::InterpreterSelectQuery( @@ -355,16 +352,14 @@ InterpreterSelectQuery::InterpreterSelectQuery( const SelectQueryOptions & options_, const Names & required_result_column_names, const StorageMetadataPtr & metadata_snapshot_, - SubqueriesForSets subquery_for_sets_, - PreparedSets prepared_sets_) + PreparedSetsPtr prepared_sets_) /// NOTE: the query almost always should be cloned because it will be modified during analysis. : IInterpreterUnionOrSelectQuery(options_.modify_inplace ? query_ptr_ : query_ptr_->clone(), context_, options_) , storage(storage_) , input_pipe(std::move(input_pipe_)) , log(&Poco::Logger::get("InterpreterSelectQuery")) , metadata_snapshot(metadata_snapshot_) - , subquery_for_sets(std::move(subquery_for_sets_)) - , prepared_sets(std::move(prepared_sets_)) + , prepared_sets(prepared_sets_) { checkStackSize(); @@ -566,8 +561,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( NameSet(required_result_column_names.begin(), required_result_column_names.end()), !options.only_analyze, options, - std::move(subquery_for_sets), - std::move(prepared_sets)); + prepared_sets); if (!options.only_analyze) { @@ -658,8 +652,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( LOG_TRACE(log, "Running 'analyze' second time"); /// Reuse already built sets for multiple passes of analysis - subquery_for_sets = std::move(query_analyzer->getSubqueriesForSets()); - prepared_sets = std::move(query_analyzer->getPreparedSets()); + prepared_sets = query_analyzer->getPreparedSets(); /// Do not try move conditions to PREWHERE for the second time. /// Otherwise, we won't be able to fallback from inefficient PREWHERE to WHERE later. 
@@ -755,14 +748,9 @@ Block InterpreterSelectQuery::getSampleBlockImpl() auto & query = getSelectQuery(); query_analyzer->makeSetsForIndex(query.where()); query_analyzer->makeSetsForIndex(query.prewhere()); - query_info.sets = std::move(query_analyzer->getPreparedSets()); - query_info.subquery_for_sets = std::move(query_analyzer->getSubqueriesForSets()); + query_info.prepared_sets = query_analyzer->getPreparedSets(); from_stage = storage->getQueryProcessingStage(context, options.to_stage, storage_snapshot, query_info); - - /// query_info.sets is used for further set index analysis. Use copy instead of move. - query_analyzer->getPreparedSets() = query_info.sets; - query_analyzer->getSubqueriesForSets() = std::move(query_info.subquery_for_sets); } /// Do I need to perform the first part of the pipeline? @@ -1174,7 +1162,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

getSettingsRef(); auto & expressions = analysis_result; - auto & subqueries_for_sets = query_analyzer->getSubqueriesForSets(); bool intermediate_stage = false; bool to_aggregation_stage = false; bool from_aggregation_stage = false; @@ -1682,8 +1669,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional
empty()) + return; + const Settings & settings = context->getSettingsRef(); SizeLimits limits(settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode); - addCreatingSetsStep(query_plan, std::move(subqueries_for_sets), limits, context); + addCreatingSetsStep(query_plan, *prepared_sets, limits, context); } diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index e70490f13ac..0a9ddb28acf 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -77,14 +77,13 @@ public: const StorageMetadataPtr & metadata_snapshot_ = nullptr, const SelectQueryOptions & = {}); - /// Reuse existing subqueries_for_sets and prepared_sets for another pass of analysis. It's used for projection. + /// Reuse existing prepared_sets for another pass of analysis. It's used for projection. /// TODO: Find a general way of sharing sets among different interpreters, such as subqueries. InterpreterSelectQuery( const ASTPtr & query_ptr_, const ContextPtr & context_, const SelectQueryOptions &, - SubqueriesForSets subquery_for_sets_, - PreparedSets prepared_sets_); + PreparedSetsPtr prepared_sets_); ~InterpreterSelectQuery() override; @@ -140,8 +139,7 @@ private: const SelectQueryOptions &, const Names & required_result_column_names = {}, const StorageMetadataPtr & metadata_snapshot_ = nullptr, - SubqueriesForSets subquery_for_sets_ = {}, - PreparedSets prepared_sets_ = {}); + PreparedSetsPtr prepared_sets_ = nullptr); InterpreterSelectQuery( const ASTPtr & query_ptr_, @@ -151,8 +149,7 @@ private: const SelectQueryOptions &, const Names & required_result_column_names = {}, const StorageMetadataPtr & metadata_snapshot_ = nullptr, - SubqueriesForSets subquery_for_sets_ = {}, - PreparedSets prepared_sets_ = {}); + PreparedSetsPtr prepared_sets_ = nullptr); ASTSelectQuery & getSelectQuery() { return query_ptr->as(); } @@ -185,7 +182,7 @@ private: static void executeProjection(QueryPlan & query_plan, const ActionsDAGPtr & expression); void executeDistinct(QueryPlan & query_plan, bool before_order, Names columns, bool pre_distinct); void executeExtremes(QueryPlan & query_plan); - void executeSubqueriesInSetsAndJoins(QueryPlan & query_plan, std::unordered_map & subqueries_for_sets); + void executeSubqueriesInSetsAndJoins(QueryPlan & query_plan); void executeMergeSorted(QueryPlan & query_plan, const SortDescription & sort_description, UInt64 limit, const std::string & description); @@ -244,8 +241,7 @@ private: StorageSnapshotPtr storage_snapshot; /// Reuse already built sets for multiple passes of analysis, possibly across interpreters. 
- SubqueriesForSets subquery_for_sets; - PreparedSets prepared_sets; + PreparedSetsPtr prepared_sets; }; } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 8c1d929e409..ed13ec0040d 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace DB @@ -911,13 +912,13 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v } } - SubqueriesForSets & subqueries_for_sets = stage.analyzer->getSubqueriesForSets(); - if (!subqueries_for_sets.empty()) + PreparedSetsPtr prepared_sets = stage.analyzer->getPreparedSets(); + if (prepared_sets && !prepared_sets->empty()) { const Settings & settings = context->getSettingsRef(); SizeLimits network_transfer_limits( settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode); - addCreatingSetsStep(plan, std::move(subqueries_for_sets), network_transfer_limits, context); + addCreatingSetsStep(plan, *prepared_sets, network_transfer_limits, context); } } diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp new file mode 100644 index 00000000000..5bfbf2b0705 --- /dev/null +++ b/src/Interpreters/PreparedSets.cpp @@ -0,0 +1,87 @@ +#include +#include + +namespace DB +{ + +PreparedSetKey PreparedSetKey::forLiteral(const IAST & ast, DataTypes types_) +{ + /// Remove LowCardinality types from type list because Set doesn't support LowCardinality keys now, + /// just converts LowCardinality to ordinary types. + for (auto & type : types_) + type = recursiveRemoveLowCardinality(type); + + PreparedSetKey key; + key.ast_hash = ast.getTreeHash(); + key.types = std::move(types_); + return key; +} + +PreparedSetKey PreparedSetKey::forSubquery(const IAST & ast) +{ + PreparedSetKey key; + key.ast_hash = ast.getTreeHash(); + return key; +} + +bool PreparedSetKey::operator==(const PreparedSetKey & other) const +{ + if (ast_hash != other.ast_hash) + return false; + + if (types.size() != other.types.size()) + return false; + + for (size_t i = 0; i < types.size(); ++i) + { + if (!types[i]->equals(*other.types[i])) + return false; + } + + return true; +} + +SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key) +{ + if (auto subqiery_it = subqueries.find(subquery_id); subqiery_it != subqueries.end()) + { + /// If you already created a Set with the same subquery / table for another ast + /// In that case several PreparedSetKey would share same subquery and set + /// Not sure if it's really possible case (maybe for distributed query when set was filled by external table?) 
+ if (subqiery_it->second.set) + sets[key] = subqiery_it->second.set; + return subqiery_it->second; + } + + return subqueries.emplace(subquery_id, sets[key]).first->second; +} + +void PreparedSets::setSet(const PreparedSetKey & key, SetPtr set_) +{ + sets[key] = std::move(set_); +} + +SetPtr & PreparedSets::getSet(const PreparedSetKey & key) +{ + return sets[key]; +} + +PreparedSets::SubqueriesForSets PreparedSets::moveSubqueries() +{ + auto res = std::move(subqueries); + subqueries = SubqueriesForSets(); + return res; +} + +bool PreparedSets::empty() const +{ + return sets.empty(); +} + +SubqueryForSet::SubqueryForSet(SetPtr & set_) : set(set_) {} + +SubqueryForSet::~SubqueryForSet() = default; + +SubqueryForSet::SubqueryForSet(SubqueryForSet &&) noexcept = default; + +}; diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index f486752e192..4a209b7c798 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -5,57 +5,51 @@ #include #include #include - +#include namespace DB { +class QueryPlan; + +class Set; +using SetPtr = std::shared_ptr; + +/// Information on how to build set for the [GLOBAL] IN section. +struct SubqueryForSet +{ + explicit SubqueryForSet(SetPtr & set_); + ~SubqueryForSet(); + + SubqueryForSet(SubqueryForSet &&) noexcept; + SubqueryForSet & operator=(SubqueryForSet &&) noexcept; + + /// The source is obtained using the InterpreterSelectQuery subquery. + std::unique_ptr source; + + /// Build this set from the result of the subquery. + SetPtr & set; + + /// If set, put the result into the table. + /// This is a temporary table for transferring to remote servers for distributed query processing. + StoragePtr table; +}; + struct PreparedSetKey { /// Prepared sets for tuple literals are indexed by the hash of the tree contents and by the desired /// data types of set elements (two different Sets can be required for two tuples with the same contents /// if left hand sides of the IN operators have different types). - static PreparedSetKey forLiteral(const IAST & ast, DataTypes types_) - { - /// Remove LowCardinality types from type list because Set doesn't support LowCardinality keys now, - /// just converts LowCardinality to ordinary types. - for (auto & type : types_) - type = recursiveRemoveLowCardinality(type); - - PreparedSetKey key; - key.ast_hash = ast.getTreeHash(); - key.types = std::move(types_); - return key; - } + static PreparedSetKey forLiteral(const IAST & ast, DataTypes types_); /// Prepared sets for subqueries are indexed only by the AST contents because the type of the resulting /// set is fully determined by the subquery. - static PreparedSetKey forSubquery(const IAST & ast) - { - PreparedSetKey key; - key.ast_hash = ast.getTreeHash(); - return key; - } + static PreparedSetKey forSubquery(const IAST & ast); IAST::Hash ast_hash; DataTypes types; /// Empty for subqueries. 
- bool operator==(const PreparedSetKey & other) const - { - if (ast_hash != other.ast_hash) - return false; - - if (types.size() != other.types.size()) - return false; - - for (size_t i = 0; i < types.size(); ++i) - { - if (!types[i]->equals(*other.types[i])) - return false; - } - - return true; - } + bool operator==(const PreparedSetKey & other) const; struct Hash { @@ -63,9 +57,31 @@ struct PreparedSetKey }; }; -class Set; -using SetPtr = std::shared_ptr; +class PreparedSets +{ +public: + using SubqueriesForSets = std::unordered_map; -using PreparedSets = std::unordered_map; + SubqueryForSet & createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key); + + void setSet(const PreparedSetKey & key, SetPtr set_); + SetPtr & getSet(const PreparedSetKey & key); + + /// Get subqueries and clear them + SubqueriesForSets moveSubqueries(); + + /// Used in KeyCondition and MergeTreeIndexConditionBloomFilter to make non exact match for types in PreparedSetKey + const std::unordered_map & getSetsMap() const { return sets; } + + bool empty() const; + +private: + std::unordered_map sets; + + /// This is the information required for building sets + SubqueriesForSets subqueries; +}; + +using PreparedSetsPtr = std::shared_ptr; } diff --git a/src/Interpreters/SubqueryForSet.cpp b/src/Interpreters/SubqueryForSet.cpp deleted file mode 100644 index d669e091131..00000000000 --- a/src/Interpreters/SubqueryForSet.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include -#include - -namespace DB -{ - -SubqueryForSet::SubqueryForSet() = default; -SubqueryForSet::~SubqueryForSet() = default; -SubqueryForSet::SubqueryForSet(SubqueryForSet &&) noexcept = default; -SubqueryForSet & SubqueryForSet::operator= (SubqueryForSet &&) noexcept = default; - -} diff --git a/src/Interpreters/SubqueryForSet.h b/src/Interpreters/SubqueryForSet.h deleted file mode 100644 index f737ec4582b..00000000000 --- a/src/Interpreters/SubqueryForSet.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -class QueryPlan; - -class Set; -using SetPtr = std::shared_ptr; - -/// Information on what to do when executing a subquery in the [GLOBAL] IN/JOIN section. -struct SubqueryForSet -{ - SubqueryForSet(); - ~SubqueryForSet(); - SubqueryForSet(SubqueryForSet &&) noexcept; - SubqueryForSet & operator=(SubqueryForSet &&) noexcept; - - /// The source is obtained using the InterpreterSelectQuery subquery. - std::unique_ptr source; - - /// If set, build it from result. - SetPtr set; - - /// If set, put the result into the table. - /// This is a temporary table for transferring to remote servers for distributed query processing. - StoragePtr table; -}; - -/// ID of subquery -> what to do with it. 
-using SubqueriesForSets = std::unordered_map; - -} diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 94d841ff095..958e34bfd03 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -61,8 +62,6 @@ void CreatingSetStep::describeActions(FormatSettings & settings) const settings.out << prefix; if (subquery_for_set.set) settings.out << "Set: "; - // else if (subquery_for_set.join) - // settings.out << "Join: "; settings.out << description << '\n'; } @@ -71,8 +70,6 @@ void CreatingSetStep::describeActions(JSONBuilder::JSONMap & map) const { if (subquery_for_set.set) map.add("Set", description); - // else if (subquery_for_set.join) - // map.add("Join", description); } @@ -125,7 +122,7 @@ void CreatingSetsStep::describePipeline(FormatSettings & settings) const } void addCreatingSetsStep( - QueryPlan & query_plan, SubqueriesForSets subqueries_for_sets, const SizeLimits & limits, ContextPtr context) + QueryPlan & query_plan, PreparedSets & prepared_sets, const SizeLimits & limits, ContextPtr context) { DataStreams input_streams; input_streams.emplace_back(query_plan.getCurrentDataStream()); @@ -134,17 +131,18 @@ void addCreatingSetsStep( plans.emplace_back(std::make_unique(std::move(query_plan))); query_plan = QueryPlan(); - for (auto & [description, set] : subqueries_for_sets) + for (auto & [description, subquery_for_set] : prepared_sets.moveSubqueries()) { - if (!set.source) + if (!subquery_for_set.source) continue; - auto plan = std::move(set.source); + auto plan = std::move(subquery_for_set.source); + subquery_for_set.source = nullptr; auto creating_set = std::make_unique( plan->getCurrentDataStream(), description, - std::move(set), + std::move(subquery_for_set), limits, context); creating_set->setStepDescription("Create set for subquery"); diff --git a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index 20cdd24c8a9..15bc33b5f56 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -2,8 +2,8 @@ #include #include -#include #include +#include namespace DB { @@ -51,7 +51,7 @@ private: void addCreatingSetsStep( QueryPlan & query_plan, - SubqueriesForSets subqueries_for_sets, + PreparedSets & prepared_sets, const SizeLimits & limits, ContextPtr context); diff --git a/src/Processors/Transforms/CreatingSetsTransform.h b/src/Processors/Transforms/CreatingSetsTransform.h index 48a32ea8663..ca59fb9e220 100644 --- a/src/Processors/Transforms/CreatingSetsTransform.h +++ b/src/Processors/Transforms/CreatingSetsTransform.h @@ -2,10 +2,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -50,7 +50,6 @@ private: Stopwatch watch; bool done_with_set = true; - //bool done_with_join = true; bool done_with_table = true; SizeLimits network_transfer_limits; diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 2d9b8028627..9147046cf12 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -21,7 +21,6 @@ class PipelineExecutor; using PipelineExecutorPtr = std::shared_ptr; struct SubqueryForSet; -using SubqueriesForSets = std::unordered_map; struct SizeLimits; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index daf31698aad..31a3fead389 100644 --- 
a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -883,14 +883,16 @@ bool KeyCondition::tryPrepareSetIndex( const ASTPtr & right_arg = args[1]; + + if (!prepared_sets) + return false; + SetPtr prepared_set; if (right_arg->as() || right_arg->as()) { - auto set_it = prepared_sets.find(PreparedSetKey::forSubquery(*right_arg)); - if (set_it == prepared_sets.end()) + prepared_set = prepared_sets->getSet(PreparedSetKey::forSubquery(*right_arg)); + if (!prepared_sets) return false; - - prepared_set = set_it->second; } else { @@ -899,8 +901,9 @@ bool KeyCondition::tryPrepareSetIndex( /// and find the one for the right arg based on the AST structure (getTreeHash), after that we check /// that the types it was prepared with are compatible with the types of the primary key. auto set_ast_hash = right_arg->getTreeHash(); + const auto & sets_map = prepared_sets->getSetsMap(); auto set_it = std::find_if( - prepared_sets.begin(), prepared_sets.end(), + sets_map.begin(), sets_map.end(), [&](const auto & candidate_entry) { if (candidate_entry.first.ast_hash != set_ast_hash) @@ -912,7 +915,7 @@ bool KeyCondition::tryPrepareSetIndex( return true; }); - if (set_it == prepared_sets.end()) + if (set_it == sets_map.end()) return false; prepared_set = set_it->second; diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index af85a90dd62..b5257625f91 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -434,7 +434,7 @@ private: const NameSet key_subexpr_names; NameSet array_joined_columns; - PreparedSets prepared_sets; + PreparedSetsPtr prepared_sets; // If true, always allow key_expr to be wrapped by function bool single_point; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 727ebc9c3cc..3b72a1568c9 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5464,13 +5464,9 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg query_ptr, query_context, query_options, - std::move(query_info.subquery_for_sets), - std::move(query_info.sets)); + query_info.prepared_sets); const auto & analysis_result = select.getAnalysisResult(); - query_info.sets = std::move(select.getQueryAnalyzer()->getPreparedSets()); - query_info.subquery_for_sets = std::move(select.getQueryAnalyzer()->getSubqueriesForSets()); - bool can_use_aggregate_projection = true; /// If the first stage of the query pipeline is more complex than Aggregating - Expression - Filter - ReadFromStorage, /// we cannot use aggregate projection. 
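In the storage layer the prepared sets now arrive as a PreparedSetsPtr that may legitimately be null or empty, so KeyCondition and the projection analysis guard the pointer before dereferencing it and treat a missing key as "no usable set". A self-contained sketch of that defensive lookup pattern follows; FakeKey, FakeSets and lookupSet are invented names standing in for PreparedSetKey and the real accessors.

#include <cstdint>
#include <iostream>
#include <memory>
#include <unordered_map>

/// Invented stand-ins: the real code keys sets by (AST tree hash, element types).
struct FakeSet { bool has_explicit_elements = true; };
using FakeSetPtr = std::shared_ptr<FakeSet>;
using FakeKey = std::uint64_t;                           /// stand-in for PreparedSetKey
using FakeSets = std::unordered_map<FakeKey, FakeSetPtr>;
using FakeSetsPtr = std::shared_ptr<FakeSets>;

/// Defensive lookup: tolerate both a null container and a missing key.
FakeSetPtr lookupSet(const FakeSetsPtr & sets, FakeKey key)
{
    if (!sets)
        return nullptr;                                  /// no sets were prepared at all
    auto it = sets->find(key);
    return it == sets->end() ? nullptr : it->second;     /// unknown key: the index can't be used
}

int main()
{
    FakeSetsPtr none;                                    /// analyzer never attached a container
    auto some = std::make_shared<FakeSets>();
    (*some)[42] = std::make_shared<FakeSet>();

    std::cout << (lookupSet(none, 42) ? "hit" : "miss") << '\n';   /// miss
    std::cout << (lookupSet(some, 42) ? "hit" : "miss") << '\n';   /// hit
    std::cout << (lookupSet(some, 7)  ? "hit" : "miss") << '\n';   /// miss
}

Returning nullptr for both failure modes lets callers keep a single "if (!prepared_set) return false;" check, as in the index-condition hunks that follow.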
diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index 7b194de8103..44b01df099f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -691,14 +691,13 @@ SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node) if (header.has(node->getColumnName())) { const auto & column_and_type = header.getByName(node->getColumnName()); - const auto & prepared_set_it = query_info.sets.find(getPreparedSetKey(node, column_and_type.type)); - - if (prepared_set_it != query_info.sets.end() && prepared_set_it->second->hasExplicitSetElements()) - return prepared_set_it->second; + auto set_key = getPreparedSetKey(node, column_and_type.type); + if (auto prepared_set = query_info.sets->getSet(set_key)) + return prepared_set; } else { - for (const auto & prepared_set_it : query_info.sets) + for (const auto & prepared_set_it : query_info.sets->getSetsMap()) if (prepared_set_it.first.ast_hash == node->getTreeHash() && prepared_set_it.second->hasExplicitSetElements()) return prepared_set_it.second; } diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index b244bd489f1..8902c41876d 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -609,11 +609,10 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( else set_key = PreparedSetKey::forLiteral(*right_arg, data_types); - auto set_it = prepared_sets.find(set_key); - if (set_it == prepared_sets.end()) + auto prepared_set = prepared_sets->getSet(set_key); + if (!prepared_set) return false; - const SetPtr & prepared_set = set_it->second; if (!prepared_set->hasExplicitSetElements()) return false; diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.h b/src/Storages/MergeTree/MergeTreeIndexFullText.h index 5f5956553dc..bb4f52a463e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.h +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.h @@ -142,8 +142,9 @@ private: BloomFilterParameters params; TokenExtractorPtr token_extractor; RPN rpn; + /// Sets from syntax analyzer. 
- PreparedSets prepared_sets; + PreparedSetsPtr prepared_sets; }; class MergeTreeIndexFullText final : public IMergeTreeIndex diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index fd943fbe1c5..da7d94d27a0 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -76,7 +76,7 @@ static RocksDBOptions getOptionsFromConfig(const Poco::Util::AbstractConfigurati // returns keys may be filter by condition static bool traverseASTFilter( - const String & primary_key, const DataTypePtr & primary_key_type, const ASTPtr & elem, const PreparedSets & sets, FieldVectorPtr & res) + const String & primary_key, const DataTypePtr & primary_key_type, const ASTPtr & elem, const PreparedSetsPtr & prepared_sets, FieldVectorPtr & res) { const auto * function = elem->as(); if (!function) @@ -86,7 +86,7 @@ static bool traverseASTFilter( { // one child has the key filter condition is ok for (const auto & child : function->arguments->children) - if (traverseASTFilter(primary_key, primary_key_type, child, sets, res)) + if (traverseASTFilter(primary_key, primary_key_type, child, prepared_sets, res)) return true; return false; } @@ -94,7 +94,7 @@ static bool traverseASTFilter( { // make sure every child has the key filter condition for (const auto & child : function->arguments->children) - if (!traverseASTFilter(primary_key, primary_key_type, child, sets, res)) + if (!traverseASTFilter(primary_key, primary_key_type, child, prepared_sets, res)) return false; return true; } @@ -109,6 +109,9 @@ static bool traverseASTFilter( if (function->name == "in") { + if (!prepared_sets) + return false; + ident = args.children.at(0)->as(); if (!ident) return false; @@ -123,16 +126,15 @@ static bool traverseASTFilter( else set_key = PreparedSetKey::forLiteral(*value, {primary_key_type}); - auto set_it = sets.find(set_key); - if (set_it == sets.end()) - return false; - SetPtr prepared_set = set_it->second; - - if (!prepared_set->hasExplicitSetElements()) + SetPtr set = prepared_sets->getSet(set_key); + if (!set) return false; - prepared_set->checkColumnsNumber(1); - const auto & set_column = *prepared_set->getSetElements()[0]; + if (!set->hasExplicitSetElements()) + return false; + + set->checkColumnsNumber(1); + const auto & set_column = *set->getSetElements()[0]; for (size_t row = 0; row < set_column.size(); ++row) res->push_back(set_column[row]); return true; diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 5046a0b6fe0..8e6009c07bf 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -44,9 +43,6 @@ using ClusterPtr = std::shared_ptr; struct MergeTreeDataSelectAnalysisResult; using MergeTreeDataSelectAnalysisResultPtr = std::shared_ptr; -struct SubqueryForSet; -using SubqueriesForSets = std::unordered_map; - struct PrewhereInfo { /// Actions for row level security filter. Applied separately before prewhere_actions. @@ -166,7 +162,7 @@ struct SelectQueryInfoBase /// Prepared sets are used for indices by storage engine. 
/// Example: x IN (1, 2, 3) - PreparedSets sets; + PreparedSetsPtr sets; /// Cached value of ExpressionAnalysisResult::has_window bool has_window = false; @@ -189,8 +185,8 @@ struct SelectQueryInfo : SelectQueryInfoBase SelectQueryInfo() = default; SelectQueryInfo(const SelectQueryInfo & other) : SelectQueryInfoBase(other) {} - /// Make subquery_for_sets reusable across different interpreters. - SubqueriesForSets subquery_for_sets; + /// Make sets reusable across different interpreters. + PreparedSetsPtr prepared_sets; }; } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 40cf650f690..43e31b8e4f4 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -154,14 +155,13 @@ bool prepareFilterBlockWithQuery(const ASTPtr & query, ContextPtr context, Block std::function is_constant = [&block, &context](const ASTPtr & node) { auto actions = std::make_shared(block.getColumnsWithTypeAndName()); - PreparedSets prepared_sets; - SubqueriesForSets subqueries_for_sets; + PreparedSetsPtr prepared_sets = std::make_shared(); const NamesAndTypesList source_columns; const NamesAndTypesList aggregation_keys; const ColumnNumbersList grouping_set_keys; ActionsVisitor::Data visitor_data( - context, SizeLimits{}, 1, source_columns, std::move(actions), prepared_sets, subqueries_for_sets, true, true, true, false, + context, SizeLimits{}, 1, source_columns, std::move(actions), prepared_sets, true, true, true, false, { aggregation_keys, grouping_set_keys, GroupByKind::NONE }); ActionsVisitor(visitor_data).visit(node); actions = visitor_data.getActions(); From d9928ac93d2d811e97af6ca0a817df4f397c8450 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 19 Jul 2022 10:28:21 +0000 Subject: [PATCH 047/164] Add methods to SubqueryForSet, do not use refernce to SetPtr --- src/Interpreters/ActionsVisitor.cpp | 14 ++-- src/Interpreters/GlobalSubqueriesVisitor.h | 8 +- src/Interpreters/InterpreterSelectQuery.h | 2 +- src/Interpreters/PreparedSets.cpp | 73 ++++++++++++------- src/Interpreters/PreparedSets.h | 28 +++++-- src/Processors/QueryPlan/CreatingSetsStep.cpp | 5 +- src/QueryPipeline/QueryPipelineBuilder.h | 2 +- src/Storages/MergeTree/KeyCondition.cpp | 33 +++++---- .../MergeTreeIndexConditionBloomFilter.cpp | 6 +- 9 files changed, 100 insertions(+), 71 deletions(-) diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 2e95a5f906f..f6299d3d23b 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -1312,11 +1312,9 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery. String set_id = right_in_operand->getColumnName(); - - SubqueryForSet & subquery_for_set = data.prepared_sets->createOrGetSubquery(set_id, set_key); - - if (subquery_for_set.set) - return subquery_for_set.set; + bool transform_null_in = data.getContext()->getSettingsRef().transform_null_in; + SubqueryForSet & subquery_for_set = data.prepared_sets->createOrGetSubquery( + set_id, set_key, std::make_shared(data.set_size_limit, false, transform_null_in)); /** The following happens for GLOBAL INs or INs: * - in the addExternalStorage function, the IN (SELECT ...) 
subquery is replaced with IN _data1, @@ -1326,14 +1324,12 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su * In case that we have HAVING with IN subquery, we have to force creating set for it. * Also it doesn't make sense if it is GLOBAL IN or ordinary IN. */ - if (!subquery_for_set.source && data.create_source_for_in) + if (data.create_source_for_in && !subquery_for_set.hasSource()) { auto interpreter = interpretSubquery(right_in_operand, data.getContext(), data.subquery_depth, {}); - subquery_for_set.source = std::make_unique(); - interpreter->buildQueryPlan(*subquery_for_set.source); + subquery_for_set.setSource(*interpreter); } - subquery_for_set.set = std::make_shared(data.set_size_limit, false, data.getContext()->getSettingsRef().transform_null_in); return subquery_for_set.set; } else diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 829beedbafa..8f5efe77d2f 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -178,12 +178,8 @@ public: } else { - auto set_key = PreparedSetKey::forSubquery(*subquery_or_table_name); - auto & subquery_for_set = prepared_sets->createOrGetSubquery(external_table_name, set_key); - - subquery_for_set.source = std::make_unique(); - interpreter->buildQueryPlan(*subquery_for_set.source); - subquery_for_set.table = external_storage; + auto & subquery_for_set = prepared_sets->getSubquery(external_table_name); + subquery_for_set.setSource(*interpreter, external_storage); } /** NOTE If it was written IN tmp_table - the existing temporary (but not external) table, diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 0a9ddb28acf..f2cdcbba9ed 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -23,7 +23,7 @@ class Logger; namespace DB { -struct SubqueryForSet; +class SubqueryForSet; class InterpreterSelectWithUnionQuery; class Context; class QueryPlan; diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index 5bfbf2b0705..f5cb4aa68b4 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB { @@ -41,29 +42,35 @@ bool PreparedSetKey::operator==(const PreparedSetKey & other) const return true; } -SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key) +SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, SetPtr set_) { - if (auto subqiery_it = subqueries.find(subquery_id); subqiery_it != subqueries.end()) + SubqueryForSet & subquery = subqueries[subquery_id]; + + /// If you already created a Set with the same subquery / table for another ast + /// In that case several PreparedSetKey would share same subquery and set + /// Not sure if it's really possible case (maybe for distributed query when set was filled by external table?) 
+ if (subquery.set) + sets[key] = subquery.set; + else + sets[key] = subquery.set = set_; + return subquery; +} + +SubqueryForSet & PreparedSets::getSubquery(const String & subquery_id) { return subqueries[subquery_id]; } + +void PreparedSets::setSet(const PreparedSetKey & key, SetPtr set_) { sets[key] = set_; } + +SetPtr & PreparedSets::getSet(const PreparedSetKey & key) { return sets[key]; } + +std::vector PreparedSets::getByTreeHash(IAST::Hash ast_hash) +{ + std::vector res; + for (const auto & it : this->sets) { - /// If you already created a Set with the same subquery / table for another ast - /// In that case several PreparedSetKey would share same subquery and set - /// Not sure if it's really possible case (maybe for distributed query when set was filled by external table?) - if (subqiery_it->second.set) - sets[key] = subqiery_it->second.set; - return subqiery_it->second; + if (it.first.ast_hash == ast_hash) + res.push_back(it.second); } - - return subqueries.emplace(subquery_id, sets[key]).first->second; -} - -void PreparedSets::setSet(const PreparedSetKey & key, SetPtr set_) -{ - sets[key] = std::move(set_); -} - -SetPtr & PreparedSets::getSet(const PreparedSetKey & key) -{ - return sets[key]; + return res; } PreparedSets::SubqueriesForSets PreparedSets::moveSubqueries() @@ -73,15 +80,31 @@ PreparedSets::SubqueriesForSets PreparedSets::moveSubqueries() return res; } -bool PreparedSets::empty() const -{ - return sets.empty(); -} +bool PreparedSets::empty() const { return sets.empty(); } -SubqueryForSet::SubqueryForSet(SetPtr & set_) : set(set_) {} +SubqueryForSet::SubqueryForSet() = default; SubqueryForSet::~SubqueryForSet() = default; SubqueryForSet::SubqueryForSet(SubqueryForSet &&) noexcept = default; +void SubqueryForSet::setSource(InterpreterSelectWithUnionQuery & interpreter, StoragePtr table_) +{ + source = std::make_unique(); + interpreter.buildQueryPlan(*source); + table = table_; +} + +bool SubqueryForSet::hasSource() const +{ + return source != nullptr; +} + +QueryPlanPtr SubqueryForSet::moveSource() +{ + auto res = std::move(source); + source = nullptr; + return res; +} + }; diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index 4a209b7c798..b8a059830a8 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -14,25 +15,36 @@ class QueryPlan; class Set; using SetPtr = std::shared_ptr; +class InterpreterSelectWithUnionQuery; /// Information on how to build set for the [GLOBAL] IN section. -struct SubqueryForSet +class SubqueryForSet { - explicit SubqueryForSet(SetPtr & set_); +public: + SubqueryForSet(); ~SubqueryForSet(); SubqueryForSet(SubqueryForSet &&) noexcept; SubqueryForSet & operator=(SubqueryForSet &&) noexcept; - /// The source is obtained using the InterpreterSelectQuery subquery. - std::unique_ptr source; + void setSource(InterpreterSelectWithUnionQuery & interpreter, StoragePtr table_ = nullptr); + + bool hasSource() const; + + /// Returns query plan for the source of the set + /// It would be removed from SubqueryForSet + std::unique_ptr moveSource(); /// Build this set from the result of the subquery. - SetPtr & set; + SetPtr set; /// If set, put the result into the table. /// This is a temporary table for transferring to remote servers for distributed query processing. StoragePtr table; + +private: + /// The source is obtained using the InterpreterSelectQuery subquery. 
+ std::unique_ptr source; }; struct PreparedSetKey @@ -62,7 +74,8 @@ class PreparedSets public: using SubqueriesForSets = std::unordered_map; - SubqueryForSet & createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key); + SubqueryForSet & createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, SetPtr set); + SubqueryForSet & getSubquery(const String & subquery_id); void setSet(const PreparedSetKey & key, SetPtr set_); SetPtr & getSet(const PreparedSetKey & key); @@ -70,8 +83,9 @@ public: /// Get subqueries and clear them SubqueriesForSets moveSubqueries(); + /// Returns all sets that match the given ast hash not checking types /// Used in KeyCondition and MergeTreeIndexConditionBloomFilter to make non exact match for types in PreparedSetKey - const std::unordered_map & getSetsMap() const { return sets; } + std::vector getByTreeHash(IAST::Hash ast_hash); bool empty() const; diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 958e34bfd03..97ec3393a7d 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -133,11 +133,10 @@ void addCreatingSetsStep( for (auto & [description, subquery_for_set] : prepared_sets.moveSubqueries()) { - if (!subquery_for_set.source) + if (!subquery_for_set.hasSource()) continue; - auto plan = std::move(subquery_for_set.source); - subquery_for_set.source = nullptr; + auto plan = subquery_for_set.moveSource(); auto creating_set = std::make_unique( plan->getCurrentDataStream(), diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 9147046cf12..bd6246d41ce 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -20,7 +20,7 @@ class QueryPlan; class PipelineExecutor; using PipelineExecutorPtr = std::shared_ptr; -struct SubqueryForSet; +class SubqueryForSet; struct SizeLimits; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 31a3fead389..80b5f4282c9 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -900,25 +900,26 @@ bool KeyCondition::tryPrepareSetIndex( /// about types in left argument of the IN operator. Instead, we manually iterate through all the sets /// and find the one for the right arg based on the AST structure (getTreeHash), after that we check /// that the types it was prepared with are compatible with the types of the primary key. 
- auto set_ast_hash = right_arg->getTreeHash(); - const auto & sets_map = prepared_sets->getSetsMap(); - auto set_it = std::find_if( - sets_map.begin(), sets_map.end(), - [&](const auto & candidate_entry) - { - if (candidate_entry.first.ast_hash != set_ast_hash) + + auto types_match = [&indexes_mapping, &data_types](const SetPtr & candidate_set) + { + assert(indexes_mapping.size() == data_types.size()); + + for (size_t i = 0; i < indexes_mapping.size(); ++i) + if (!candidate_set->areTypesEqual(indexes_mapping[i].tuple_index, data_types[i])) return false; - for (size_t i = 0; i < indexes_mapping.size(); ++i) - if (!candidate_entry.second->areTypesEqual(indexes_mapping[i].tuple_index, data_types[i])) - return false; + return true; + }; - return true; - }); - if (set_it == sets_map.end()) - return false; - - prepared_set = set_it->second; + for (const auto & set : prepared_sets->getByTreeHash(right_arg->getTreeHash())) + { + if (types_match(set)) + { + prepared_set = set; + break; + } + } } /// The index can be prepared if the elements of the set were saved in advance. diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index 44b01df099f..f86255dc510 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -697,9 +697,9 @@ SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node) } else { - for (const auto & prepared_set_it : query_info.sets->getSetsMap()) - if (prepared_set_it.first.ast_hash == node->getTreeHash() && prepared_set_it.second->hasExplicitSetElements()) - return prepared_set_it.second; + for (const auto & set : query_info.sets->getByTreeHash(node->getTreeHash())) + if (set->hasExplicitSetElements()) + return set; } return DB::SetPtr(); From 11d37a8dc9d2412e67046330bbf728a6cda07e79 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 20 Jul 2022 14:44:26 +0000 Subject: [PATCH 048/164] Properly initialize prepared_sets --- src/Interpreters/ExpressionAnalyzer.cpp | 5 ++++- src/Interpreters/InterpreterSelectQuery.cpp | 3 +++ src/Interpreters/PreparedSets.cpp | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index f7c6dd46233..b6a8456003e 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -234,7 +234,10 @@ ExpressionAnalyzer::ExpressionAnalyzer( , syntax(syntax_analyzer_result_) { /// Cache prepared sets because we might run analysis multiple times - prepared_sets = prepared_sets_; + if (prepared_sets_) + prepared_sets = prepared_sets_; + else + prepared_sets = std::make_shared(); /// external_tables, sets for global subqueries. /// Replaces global subqueries with the generated names of temporary tables that will be sent to remote servers. 
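The ExpressionAnalyzer hunk above falls back to a freshly constructed PreparedSets whenever the caller did not supply one, so the rest of the analyzer can dereference prepared_sets unconditionally; the interpreter receives the same treatment in the next hunk. A small standalone sketch of the idiom, with FakeAnalyzer / FakePreparedSets as invented stand-ins for the real classes:

#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

/// Invented stand-in for DB::PreparedSets.
struct FakePreparedSets { std::unordered_map<std::string, int> sets; };
using FakePreparedSetsPtr = std::shared_ptr<FakePreparedSets>;

/// Reuse the caller's cache when one is given (projection analysis, second pass),
/// otherwise own a fresh one, so members never need a null check later.
struct FakeAnalyzer
{
    FakePreparedSetsPtr prepared_sets;

    explicit FakeAnalyzer(FakePreparedSetsPtr shared = nullptr)
        : prepared_sets(shared ? std::move(shared) : std::make_shared<FakePreparedSets>())
    {
    }
};

int main()
{
    auto shared_sets = std::make_shared<FakePreparedSets>();
    FakeAnalyzer reusing(shared_sets);   /// second pass: shares the caller's sets
    FakeAnalyzer standalone;             /// plain query: owns its own empty cache
    assert(reusing.prepared_sets == shared_sets);
    assert(standalone.prepared_sets && standalone.prepared_sets != shared_sets);
    (void) reusing;
    (void) standalone;
}

Constructing the fallback eagerly in the constructor costs a little more than a lazy check at each use, but it removes an entire class of null-pointer paths from later code.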
diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index a33c46ee248..3e322a62549 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -363,6 +363,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( { checkStackSize(); + if (!prepared_sets) + prepared_sets = std::make_shared(); + query_info.ignore_projections = options.ignore_projections; query_info.is_projection_query = options.is_projection_query; diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index f5cb4aa68b4..cb85ef15f0f 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -92,7 +92,8 @@ void SubqueryForSet::setSource(InterpreterSelectWithUnionQuery & interpreter, St { source = std::make_unique(); interpreter.buildQueryPlan(*source); - table = table_; + if (table_) + table = table_; } bool SubqueryForSet::hasSource() const From 5ce2960f03cbea91ffcb9320e5c35d4abdf667dd Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 20 Jul 2022 14:45:07 +0000 Subject: [PATCH 049/164] Get rid of SelectQueryInfoBase -> SelectQueryInfo --- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- src/Storages/MergeTree/KeyCondition.cpp | 2 +- .../MergeTreeIndexConditionBloomFilter.cpp | 4 ++-- .../MergeTree/MergeTreeIndexFullText.cpp | 2 +- .../RocksDB/StorageEmbeddedRocksDB.cpp | 2 +- src/Storages/SelectQueryInfo.h | 19 +++++++------------ 6 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 3e322a62549..1e54d75b5d5 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2015,7 +2015,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc SelectQueryInfo temp_query_info; temp_query_info.query = query_ptr; temp_query_info.syntax_analyzer_result = syntax_analyzer_result; - temp_query_info.sets = query_analyzer->getPreparedSets(); + temp_query_info.prepared_sets = query_analyzer->getPreparedSets(); num_rows = storage->totalRowsByPartitionPredicate(temp_query_info, context); } diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 80b5f4282c9..83e7eee263e 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -442,7 +442,7 @@ KeyCondition::KeyCondition( bool strict_) : key_expr(key_expr_) , key_subexpr_names(getAllSubexpressionNames(*key_expr)) - , prepared_sets(query_info.sets) + , prepared_sets(query_info.prepared_sets) , single_point(single_point_) , strict(strict_) { diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index f86255dc510..54b88c623f5 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -692,12 +692,12 @@ SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node) { const auto & column_and_type = header.getByName(node->getColumnName()); auto set_key = getPreparedSetKey(node, column_and_type.type); - if (auto prepared_set = query_info.sets->getSet(set_key)) + if (auto prepared_set = query_info.prepared_sets->getSet(set_key)) return prepared_set; } else { - for (const auto & set : query_info.sets->getByTreeHash(node->getTreeHash())) + for (const auto & set : 
query_info.prepared_sets->getByTreeHash(node->getTreeHash())) if (set->hasExplicitSetElements()) return set; } diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 8902c41876d..8cf4b615e56 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -146,7 +146,7 @@ MergeTreeConditionFullText::MergeTreeConditionFullText( , index_data_types(index_sample_block.getNamesAndTypesList().getTypes()) , params(params_) , token_extractor(token_extactor_) - , prepared_sets(query_info.sets) + , prepared_sets(query_info.prepared_sets) { rpn = std::move( RPNBuilder( diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index da7d94d27a0..05496f817f3 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -175,7 +175,7 @@ static std::pair getFilterKeys( return {{}, true}; FieldVectorPtr res = std::make_shared(); - auto matched_keys = traverseASTFilter(primary_key, primary_key_type, select.where(), query_info.sets, res); + auto matched_keys = traverseASTFilter(primary_key, primary_key_type, select.where(), query_info.prepared_sets, res); return std::make_pair(res, !matched_keys); } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 8e6009c07bf..143d9a1b616 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -132,8 +132,13 @@ struct ProjectionCandidate * that can be used during query processing * inside storage engines. */ -struct SelectQueryInfoBase +struct SelectQueryInfo { + + SelectQueryInfo() + : prepared_sets(std::make_shared()) + {} + ASTPtr query; ASTPtr view_query; /// Optimized VIEW query ASTPtr original_query; /// Unmodified query for projection analysis @@ -162,7 +167,7 @@ struct SelectQueryInfoBase /// Prepared sets are used for indices by storage engine. /// Example: x IN (1, 2, 3) - PreparedSetsPtr sets; + PreparedSetsPtr prepared_sets; /// Cached value of ExpressionAnalysisResult::has_window bool has_window = false; @@ -179,14 +184,4 @@ struct SelectQueryInfoBase MergeTreeDataSelectAnalysisResultPtr merge_tree_select_result_ptr; }; -/// Contains non-copyable stuff -struct SelectQueryInfo : SelectQueryInfoBase -{ - SelectQueryInfo() = default; - SelectQueryInfo(const SelectQueryInfo & other) : SelectQueryInfoBase(other) {} - - /// Make sets reusable across different interpreters. 
- PreparedSetsPtr prepared_sets; -}; - } From 8eecb9ef8236d6b87f67628b5fd21f1f573320cc Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 27 Jul 2022 11:22:16 +0000 Subject: [PATCH 050/164] upd PreparedSets: rename/change signature of methods, add comments --- src/Interpreters/ActionsVisitor.cpp | 13 ++++----- src/Interpreters/ExpressionAnalyzer.cpp | 6 ++-- src/Interpreters/GlobalSubqueriesVisitor.h | 2 +- src/Interpreters/InterpreterSelectQuery.cpp | 8 +----- src/Interpreters/MutationsInterpreter.cpp | 9 +----- src/Interpreters/PreparedSets.cpp | 24 ++++++++-------- src/Interpreters/PreparedSets.h | 28 +++++++++---------- src/Processors/QueryPlan/CreatingSetsStep.cpp | 14 ++++++---- src/Processors/QueryPlan/CreatingSetsStep.h | 6 +--- src/Storages/MergeTree/KeyCondition.cpp | 2 +- .../MergeTreeIndexConditionBloomFilter.cpp | 2 +- .../MergeTree/MergeTreeIndexFullText.cpp | 2 +- .../RocksDB/StorageEmbeddedRocksDB.cpp | 2 +- 13 files changed, 51 insertions(+), 67 deletions(-) diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index f6299d3d23b..e545347b3ae 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -401,7 +401,7 @@ SetPtr makeExplicitSet( element_type = low_cardinality_type->getDictionaryType(); auto set_key = PreparedSetKey::forLiteral(*right_arg, set_element_types); - if (auto set = prepared_sets.getSet(set_key)) + if (auto set = prepared_sets.get(set_key)) return set; /// Already prepared. Block block; @@ -417,7 +417,7 @@ SetPtr makeExplicitSet( set->insertFromBlock(block.getColumnsWithTypeAndName()); set->finishInsert(); - prepared_sets.setSet(set_key, set); + prepared_sets.set(set_key, set); return set; } @@ -1288,7 +1288,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su if (no_subqueries) return {}; auto set_key = PreparedSetKey::forSubquery(*right_in_operand); - if (SetPtr set = data.prepared_sets->getSet(set_key)) + if (SetPtr set = data.prepared_sets->get(set_key)) return set; /// A special case is if the name of the table is specified on the right side of the IN statement, @@ -1304,7 +1304,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su if (storage_set) { SetPtr set = storage_set->getSet(); - data.prepared_sets->setSet(set_key, set); + data.prepared_sets->set(set_key, set); return set; } } @@ -1313,8 +1313,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery. String set_id = right_in_operand->getColumnName(); bool transform_null_in = data.getContext()->getSettingsRef().transform_null_in; - SubqueryForSet & subquery_for_set = data.prepared_sets->createOrGetSubquery( - set_id, set_key, std::make_shared(data.set_size_limit, false, transform_null_in)); + SubqueryForSet & subquery_for_set = data.prepared_sets->createOrGetSubquery(set_id, set_key, data.set_size_limit, transform_null_in); /** The following happens for GLOBAL INs or INs: * - in the addExternalStorage function, the IN (SELECT ...) 
subquery is replaced with IN _data1, @@ -1327,7 +1326,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su if (data.create_source_for_in && !subquery_for_set.hasSource()) { auto interpreter = interpretSubquery(right_in_operand, data.getContext(), data.subquery_depth, {}); - subquery_for_set.setSource(*interpreter); + subquery_for_set.createSource(*interpreter); } return subquery_for_set.set; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index b6a8456003e..d4ca51ea107 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -516,12 +516,12 @@ void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_ auto set_key = PreparedSetKey::forSubquery(*subquery_or_table_name); - if (prepared_sets->getSet(set_key)) + if (prepared_sets->get(set_key)) return; /// Already prepared. if (auto set_ptr_from_storage_set = isPlainStorageSetInSubquery(subquery_or_table_name)) { - prepared_sets->setSet(set_key, set_ptr_from_storage_set); + prepared_sets->set(set_key, set_ptr_from_storage_set); return; } @@ -545,7 +545,7 @@ void ExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_ set->finishInsert(); - prepared_sets->setSet(set_key, std::move(set)); + prepared_sets->set(set_key, std::move(set)); } SetPtr ExpressionAnalyzer::isPlainStorageSetInSubquery(const ASTPtr & subquery_or_table_name) diff --git a/src/Interpreters/GlobalSubqueriesVisitor.h b/src/Interpreters/GlobalSubqueriesVisitor.h index 8f5efe77d2f..b62fe817b84 100644 --- a/src/Interpreters/GlobalSubqueriesVisitor.h +++ b/src/Interpreters/GlobalSubqueriesVisitor.h @@ -179,7 +179,7 @@ public: else { auto & subquery_for_set = prepared_sets->getSubquery(external_table_name); - subquery_for_set.setSource(*interpreter, external_storage); + subquery_for_set.createSource(*interpreter, external_storage); } /** NOTE If it was written IN tmp_table - the existing temporary (but not external) table, diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 1e54d75b5d5..25a7a5f89fc 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2829,13 +2829,7 @@ void InterpreterSelectQuery::executeExtremes(QueryPlan & query_plan) void InterpreterSelectQuery::executeSubqueriesInSetsAndJoins(QueryPlan & query_plan) { - if (!prepared_sets || prepared_sets->empty()) - return; - - const Settings & settings = context->getSettingsRef(); - - SizeLimits limits(settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode); - addCreatingSetsStep(query_plan, *prepared_sets, limits, context); + addCreatingSetsStep(query_plan, prepared_sets, context); } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index ed13ec0040d..e66ac9a5838 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -912,14 +912,7 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v } } - PreparedSetsPtr prepared_sets = stage.analyzer->getPreparedSets(); - if (prepared_sets && !prepared_sets->empty()) - { - const Settings & settings = context->getSettingsRef(); - SizeLimits network_transfer_limits( - settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode); - addCreatingSetsStep(plan, *prepared_sets, network_transfer_limits, context); - } + 
addCreatingSetsStep(plan, stage.analyzer->getPreparedSets(), context); } QueryPlanOptimizationSettings do_not_optimize_plan; diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index cb85ef15f0f..79cfb8b688a 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -42,7 +43,8 @@ bool PreparedSetKey::operator==(const PreparedSetKey & other) const return true; } -SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, SetPtr set_) +SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, + SizeLimits set_size_limit, bool transform_null_in) { SubqueryForSet & subquery = subqueries[subquery_id]; @@ -52,15 +54,17 @@ SubqueryForSet & PreparedSets::createOrGetSubquery(const String & subquery_id, c if (subquery.set) sets[key] = subquery.set; else - sets[key] = subquery.set = set_; + sets[key] = subquery.set = std::make_shared(set_size_limit, false, transform_null_in); return subquery; } +/// If the subquery is not associated with any set, create default-constructed SubqueryForSet. +/// It's aimed to fill external table passed to SubqueryForSet::createSource. SubqueryForSet & PreparedSets::getSubquery(const String & subquery_id) { return subqueries[subquery_id]; } -void PreparedSets::setSet(const PreparedSetKey & key, SetPtr set_) { sets[key] = set_; } +void PreparedSets::set(const PreparedSetKey & key, SetPtr set_) { sets[key] = set_; } -SetPtr & PreparedSets::getSet(const PreparedSetKey & key) { return sets[key]; } +SetPtr & PreparedSets::get(const PreparedSetKey & key) { return sets[key]; } std::vector PreparedSets::getByTreeHash(IAST::Hash ast_hash) { @@ -73,7 +77,7 @@ std::vector PreparedSets::getByTreeHash(IAST::Hash ast_hash) return res; } -PreparedSets::SubqueriesForSets PreparedSets::moveSubqueries() +PreparedSets::SubqueriesForSets PreparedSets::detachSubqueries() { auto res = std::move(subqueries); subqueries = SubqueriesForSets(); @@ -82,13 +86,7 @@ PreparedSets::SubqueriesForSets PreparedSets::moveSubqueries() bool PreparedSets::empty() const { return sets.empty(); } -SubqueryForSet::SubqueryForSet() = default; - -SubqueryForSet::~SubqueryForSet() = default; - -SubqueryForSet::SubqueryForSet(SubqueryForSet &&) noexcept = default; - -void SubqueryForSet::setSource(InterpreterSelectWithUnionQuery & interpreter, StoragePtr table_) +void SubqueryForSet::createSource(InterpreterSelectWithUnionQuery & interpreter, StoragePtr table_) { source = std::make_unique(); interpreter.buildQueryPlan(*source); @@ -101,7 +99,7 @@ bool SubqueryForSet::hasSource() const return source != nullptr; } -QueryPlanPtr SubqueryForSet::moveSource() +QueryPlanPtr SubqueryForSet::detachSource() { auto res = std::move(source); source = nullptr; diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index b8a059830a8..06600c49f13 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace DB { @@ -21,19 +23,14 @@ class InterpreterSelectWithUnionQuery; class SubqueryForSet { public: - SubqueryForSet(); - ~SubqueryForSet(); - SubqueryForSet(SubqueryForSet &&) noexcept; - SubqueryForSet & operator=(SubqueryForSet &&) noexcept; - - void setSource(InterpreterSelectWithUnionQuery & interpreter, StoragePtr table_ = nullptr); + void createSource(InterpreterSelectWithUnionQuery & 
interpreter, StoragePtr table_ = nullptr); bool hasSource() const; - /// Returns query plan for the source of the set - /// It would be removed from SubqueryForSet - std::unique_ptr moveSource(); + /// Returns query plan for the set's source + /// and removes it from SubqueryForSet because we need to build it only once. + std::unique_ptr detachSource(); /// Build this set from the result of the subquery. SetPtr set; @@ -74,14 +71,17 @@ class PreparedSets public: using SubqueriesForSets = std::unordered_map; - SubqueryForSet & createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, SetPtr set); + SubqueryForSet & createOrGetSubquery(const String & subquery_id, const PreparedSetKey & key, + SizeLimits set_size_limit, bool transform_null_in); SubqueryForSet & getSubquery(const String & subquery_id); - void setSet(const PreparedSetKey & key, SetPtr set_); - SetPtr & getSet(const PreparedSetKey & key); + void set(const PreparedSetKey & key, SetPtr set_); + SetPtr & get(const PreparedSetKey & key); - /// Get subqueries and clear them - SubqueriesForSets moveSubqueries(); + /// Get subqueries and clear them. + /// We need to build a plan for subqueries just once. That's why we can clear them after accessing them. + /// SetPtr would still be available for consumers of PreparedSets. + SubqueriesForSets detachSubqueries(); /// Returns all sets that match the given ast hash not checking types /// Used in KeyCondition and MergeTreeIndexConditionBloomFilter to make non exact match for types in PreparedSetKey diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 97ec3393a7d..bd079c0b8a9 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -121,9 +122,11 @@ void CreatingSetsStep::describePipeline(FormatSettings & settings) const IQueryPlanStep::describePipeline(processors, settings); } -void addCreatingSetsStep( - QueryPlan & query_plan, PreparedSets & prepared_sets, const SizeLimits & limits, ContextPtr context) +void addCreatingSetsStep(QueryPlan & query_plan, PreparedSetsPtr prepared_sets, ContextPtr context) { + if (!prepared_sets || prepared_sets->empty()) + return; + DataStreams input_streams; input_streams.emplace_back(query_plan.getCurrentDataStream()); @@ -131,18 +134,19 @@ void addCreatingSetsStep( plans.emplace_back(std::make_unique(std::move(query_plan))); query_plan = QueryPlan(); - for (auto & [description, subquery_for_set] : prepared_sets.moveSubqueries()) + for (auto & [description, subquery_for_set] : prepared_sets->detachSubqueries()) { if (!subquery_for_set.hasSource()) continue; - auto plan = subquery_for_set.moveSource(); + auto plan = subquery_for_set.detachSource(); + const Settings & settings = context->getSettingsRef(); auto creating_set = std::make_unique( plan->getCurrentDataStream(), description, std::move(subquery_for_set), - limits, + SizeLimits(settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode), context); creating_set->setStepDescription("Create set for subquery"); plan->addStep(std::move(creating_set)); diff --git a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index 15bc33b5f56..9c61eb2012c 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -49,10 +49,6 @@ private: Processors processors; }; -void addCreatingSetsStep( - 
QueryPlan & query_plan, - PreparedSets & prepared_sets, - const SizeLimits & limits, - ContextPtr context); +void addCreatingSetsStep(QueryPlan & query_plan, PreparedSetsPtr prepared_sets, ContextPtr context); } diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 83e7eee263e..7c71e395068 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -890,7 +890,7 @@ bool KeyCondition::tryPrepareSetIndex( SetPtr prepared_set; if (right_arg->as() || right_arg->as()) { - prepared_set = prepared_sets->getSet(PreparedSetKey::forSubquery(*right_arg)); + prepared_set = prepared_sets->get(PreparedSetKey::forSubquery(*right_arg)); if (!prepared_sets) return false; } diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp index 54b88c623f5..a80f7093775 100644 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp @@ -692,7 +692,7 @@ SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node) { const auto & column_and_type = header.getByName(node->getColumnName()); auto set_key = getPreparedSetKey(node, column_and_type.type); - if (auto prepared_set = query_info.prepared_sets->getSet(set_key)) + if (auto prepared_set = query_info.prepared_sets->get(set_key)) return prepared_set; } else diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 8cf4b615e56..ff924290783 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -609,7 +609,7 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( else set_key = PreparedSetKey::forLiteral(*right_arg, data_types); - auto prepared_set = prepared_sets->getSet(set_key); + auto prepared_set = prepared_sets->get(set_key); if (!prepared_set) return false; diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 05496f817f3..617e58c525b 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -126,7 +126,7 @@ static bool traverseASTFilter( else set_key = PreparedSetKey::forLiteral(*value, {primary_key_type}); - SetPtr set = prepared_sets->getSet(set_key); + SetPtr set = prepared_sets->get(set_key); if (!set) return false; From bec7408a0cb775f597eade86909d136d557fd50a Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Jul 2022 09:20:28 -0700 Subject: [PATCH 051/164] reflected change requests asked on July 27 --- programs/server/config.xml | 2 +- src/Daemon/BaseDaemon.cpp | 2 +- src/Loggers/Loggers.cpp | 8 +-- src/Loggers/OwnFormattingChannel.cpp | 13 ++--- src/Loggers/OwnJSONPatternFormatter.cpp | 70 ++++++------------------- src/Loggers/OwnJSONPatternFormatter.h | 2 +- src/Loggers/OwnPatternFormatter.cpp | 4 -- src/Loggers/OwnPatternFormatter.h | 3 +- 8 files changed, 30 insertions(+), 74 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index d261c1e3694..ef68741f056 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -68,7 +68,7 @@ To enable JSON logging support, just uncomment tag. Having the tag will make it work. For better understanding/visibility, you can add "true" or "1". 
--> - + diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index e162360ddaa..61ad1785b2d 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1014,7 +1014,7 @@ void BaseDaemon::setupWatchdog() if (config().getRawString("logger.stream_compress", "false") == "true") { Poco::AutoPtr pf; - if (config().has("logger.json")) + if (config().getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 6f8c88a7e87..35415fb91ba 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -99,7 +99,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log Poco::AutoPtr pf; - if (config.has("logger.json")) + if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); @@ -140,7 +140,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log Poco::AutoPtr pf; - if (config.has("logger.json")) + if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); @@ -184,7 +184,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log Poco::AutoPtr pf; - if (config.has("logger.json")) + if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(true); @@ -211,7 +211,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log } Poco::AutoPtr pf; - if (config.has("logger.json")) + if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else pf = new OwnPatternFormatter(color_enabled); diff --git a/src/Loggers/OwnFormattingChannel.cpp b/src/Loggers/OwnFormattingChannel.cpp index 1487c5ed03b..f03d155bde7 100644 --- a/src/Loggers/OwnFormattingChannel.cpp +++ b/src/Loggers/OwnFormattingChannel.cpp @@ -1,20 +1,17 @@ #include "OwnFormattingChannel.h" -#include "OwnJSONPatternFormatter.h" #include "OwnPatternFormatter.h" + + namespace DB { + void OwnFormattingChannel::logExtended(const ExtendedLogMessage & msg) { if (pChannel && priority >= msg.base.getPriority()) { - std::string text; - if (auto * formatter = dynamic_cast(pFormatter.get())) - { - formatter->formatExtended(msg, text); - pChannel->log(Poco::Message(msg.base, text)); - } - else if (pFormatter) + if (pFormatter) { + std::string text; pFormatter->formatExtended(msg, text); pChannel->log(Poco::Message(msg.base, text)); } diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp index 63f2c60f70e..afbdc2e7ecf 100644 --- a/src/Loggers/OwnJSONPatternFormatter.cpp +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -13,19 +13,16 @@ OwnJSONPatternFormatter::OwnJSONPatternFormatter() : OwnPatternFormatter("") } -void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) +void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const { DB::WriteBufferFromString wb(text); DB::FormatSettings settings; - char key_name[] = "a placeholder for key names in structured logging"; - char empty_string[] = ""; - + const Poco::Message & msg = msg_ext.base; DB::writeChar('{', wb); - strcpy(key_name, "date_time"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("date_time", wb, settings); 
DB::writeChar(':', wb); DB::writeChar('\"', wb); @@ -42,20 +39,14 @@ void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ DB::writeChar(',', wb); - strcpy(key_name, "thread_name"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("thread_name", wb, settings); DB::writeChar(':', wb); - const char * thread_name = msg.getThread().c_str(); - if (thread_name != nullptr) - writeJSONString(thread_name, thread_name + strlen(thread_name), wb, settings); - else - writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); + writeJSONString(msg.getThread(), wb, settings); DB::writeChar(',', wb); - strcpy(key_name, "thread_id"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("thread_id", wb, settings); DB::writeChar(':', wb); DB::writeChar('\"', wb); DB::writeIntText(msg_ext.thread_id, wb); @@ -63,65 +54,38 @@ void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ DB::writeChar(',', wb); - strcpy(key_name, "level"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("level", wb, settings); DB::writeChar(':', wb); - int priority_int = static_cast(msg.getPriority()); - String priority_str = std::to_string(priority_int); - const char * priority = priority_str.c_str(); - if (priority != nullptr) - writeJSONString(priority, priority + strlen(priority), wb, settings); - else - writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); - + int priority = static_cast(msg.getPriority()); + writeJSONString(std::to_string(priority), wb, settings); DB::writeChar(',', wb); /// We write query_id even in case when it is empty (no query context) /// just to be convenient for various log parsers. 
- strcpy(key_name, "query_id"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("query_id", wb, settings); DB::writeChar(':', wb); writeJSONString(msg_ext.query_id, wb, settings); DB::writeChar(',', wb); - strcpy(key_name, "logger_name"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("logger_name", wb, settings); DB::writeChar(':', wb); - const char * logger_name = msg.getSource().c_str(); - if (logger_name != nullptr) - writeJSONString(logger_name, logger_name + strlen(logger_name), wb, settings); - else - writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); - + writeJSONString(msg.getSource(), wb, settings); DB::writeChar(',', wb); - strcpy(key_name, "message"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("message", wb, settings); DB::writeChar(':', wb); - const char * msg_text = msg.getText().c_str(); - if (msg_text != nullptr) - writeJSONString(msg_text, msg_text + strlen(msg_text), wb, settings); - else - writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); - + writeJSONString(msg.getText(), wb, settings); DB::writeChar(',', wb); - strcpy(key_name, "source_file"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("source_file", wb, settings); DB::writeChar(':', wb); - const char * source_file = msg.getSourceFile(); - if (source_file != nullptr) - writeJSONString(source_file, source_file + strlen(source_file), wb, settings); - else - writeJSONString(empty_string, empty_string + strlen(empty_string), wb, settings); - + writeJSONString(msg.getSourceFile(), wb, settings); DB::writeChar(',', wb); - strcpy(key_name, "source_line"); - writeJSONString(key_name, key_name + strlen(key_name), wb, settings); + writeJSONString("source_line", wb, settings); DB::writeChar(':', wb); DB::writeChar('\"', wb); DB::writeIntText(msg.getSourceLine(), wb); diff --git a/src/Loggers/OwnJSONPatternFormatter.h b/src/Loggers/OwnJSONPatternFormatter.h index 76a0104317e..032506b15e3 100644 --- a/src/Loggers/OwnJSONPatternFormatter.h +++ b/src/Loggers/OwnJSONPatternFormatter.h @@ -28,5 +28,5 @@ public: OwnJSONPatternFormatter(); void format(const Poco::Message & msg, std::string & text) override; - static void formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text); + void formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const override; }; diff --git a/src/Loggers/OwnPatternFormatter.cpp b/src/Loggers/OwnPatternFormatter.cpp index f5ee60a2113..67e038f4ea1 100644 --- a/src/Loggers/OwnPatternFormatter.cpp +++ b/src/Loggers/OwnPatternFormatter.cpp @@ -13,10 +13,6 @@ OwnPatternFormatter::OwnPatternFormatter(bool color_) : Poco::PatternFormatter(" { } -OwnPatternFormatter::OwnPatternFormatter() : Poco::PatternFormatter("") -{ -} - void OwnPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const { DB::WriteBufferFromString wb(text); diff --git a/src/Loggers/OwnPatternFormatter.h b/src/Loggers/OwnPatternFormatter.h index 154068f75fe..d776b097cb2 100644 --- a/src/Loggers/OwnPatternFormatter.h +++ b/src/Loggers/OwnPatternFormatter.h @@ -25,10 +25,9 @@ class OwnPatternFormatter : public Poco::PatternFormatter { public: OwnPatternFormatter(bool color_ = false); - OwnPatternFormatter(); void format(const Poco::Message & msg, std::string & text) override; - void formatExtended(const DB::ExtendedLogMessage & msg_ext, 
std::string & text) const; + virtual void formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const; private: bool color; From bb8f7f6f0bb58d8db1f41a2db62eec7fb62aca46 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Jul 2022 09:52:30 -0700 Subject: [PATCH 052/164] style correction in OwnJSONPatternFormatter.cpp --- src/Loggers/OwnJSONPatternFormatter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp index afbdc2e7ecf..4ab2066b548 100644 --- a/src/Loggers/OwnJSONPatternFormatter.cpp +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -18,7 +18,7 @@ void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ DB::WriteBufferFromString wb(text); DB::FormatSettings settings; - + const Poco::Message & msg = msg_ext.base; DB::writeChar('{', wb); From f9e4d6370db9ac181b683c6c6772c94ec7da9f72 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Jul 2022 23:51:59 -0700 Subject: [PATCH 053/164] modified integration test as suggested --- programs/server/config.xml | 9 ++++----- .../configs/config_json.xml | 14 ++++++++++++++ .../test_structured_logging_json/test.py | 10 +--------- 3 files changed, 19 insertions(+), 14 deletions(-) create mode 100644 tests/integration/test_structured_logging_json/configs/config_json.xml diff --git a/programs/server/config.xml b/programs/server/config.xml index ef68741f056..0f759caafef 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -61,14 +61,13 @@ --> - + json diff --git a/tests/integration/test_structured_logging_json/configs/config_json.xml b/tests/integration/test_structured_logging_json/configs/config_json.xml new file mode 100644 index 00000000000..a0b158fece3 --- /dev/null +++ b/tests/integration/test_structured_logging_json/configs/config_json.xml @@ -0,0 +1,14 @@ + + + + + + + + diff --git a/tests/integration/test_structured_logging_json/test.py b/tests/integration/test_structured_logging_json/test.py index 34507a605c6..3a673254051 100644 --- a/tests/integration/test_structured_logging_json/test.py +++ b/tests/integration/test_structured_logging_json/test.py @@ -1,11 +1,9 @@ import pytest from helpers.cluster import ClickHouseCluster -import logging import json -from xml.etree import ElementTree cluster = ClickHouseCluster(__file__) -node = cluster.add_instance("node", stay_alive=True) +node = cluster.add_instance("node", main_configs=["configs/config_json.xml"]) @pytest.fixture(scope="module") @@ -37,12 +35,6 @@ def is_json(log_json): def test_structured_logging_json_format(start_cluster): - config = node.exec_in_container(["cat", "/etc/clickhouse-server/config.xml"]) - root = ElementTree.fromstring(config) - for logger in root.findall("logger"): - if logger.find("json") is None: - pytest.skip("JSON is not activated in config.xml") - node.query("SELECT 1") logs = node.grep_in_log(" ") From 39369be318aec04685d5c26793ba26eaa01c8f89 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Jul 2022 23:56:26 -0700 Subject: [PATCH 054/164] commented out formatting tag in config.xml --- programs/server/config.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index 0f759caafef..1247eb0e60f 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -62,12 +62,12 @@ --> - json + From 16c32b8b3f01b0ab25a0f549e6286e39bee5fa6f Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Jul 2022 00:07:31 -0700 Subject: [PATCH 
055/164] removed True from assert --- tests/integration/test_structured_logging_json/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_structured_logging_json/test.py b/tests/integration/test_structured_logging_json/test.py index 3a673254051..fdbf3651a0a 100644 --- a/tests/integration/test_structured_logging_json/test.py +++ b/tests/integration/test_structured_logging_json/test.py @@ -46,4 +46,4 @@ def test_structured_logging_json_format(start_cluster): # we will test maximum 5 logs if i >= min(4, len(log_array) - 1): break - assert result == True + assert result From 68f825acd4ea10861ef8d3b56d909d33d57f0374 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 29 Jul 2022 05:09:48 -0700 Subject: [PATCH 056/164] uncommented formatting tag --- .../test_structured_logging_json/configs/config_json.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_structured_logging_json/configs/config_json.xml b/tests/integration/test_structured_logging_json/configs/config_json.xml index a0b158fece3..c0bd4e86a01 100644 --- a/tests/integration/test_structured_logging_json/configs/config_json.xml +++ b/tests/integration/test_structured_logging_json/configs/config_json.xml @@ -8,7 +8,7 @@ {"date_time":"1650918987.180175","thread_name":"#1","thread_id":"254545","level":"Trace","query_id":"","logger_name":"BaseDaemon","message":"Received signal 2","source_file":"../base/daemon/BaseDaemon.cpp; virtual void SignalListener::run()","source_line":"192"} To enable JSON logging support, just uncomment tag below. --> - + json From a0d51601bf9aade0bd9303588b7d032b804f2442 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 1 Aug 2022 13:07:48 +0200 Subject: [PATCH 057/164] Update EscapingRuleUtils.cpp --- src/Formats/EscapingRuleUtils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 69684b67071..f8c002e87ee 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -327,7 +327,6 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings /// Check settings specific for JSON formats. /// If we have numbers and strings, convert numbers to strings. 
- /// (Actually numbers could not be parsed from if (settings.json.try_infer_numbers_from_strings) { bool have_strings = false; From 1c1e7eb0cd2d5f841979723ba27a7abc7ab47d6e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 1 Aug 2022 13:16:23 +0200 Subject: [PATCH 058/164] Fix duplicate lines --- src/DataTypes/transformTypesRecursively.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/DataTypes/transformTypesRecursively.cpp b/src/DataTypes/transformTypesRecursively.cpp index 48e9dc60c19..3544c7e477d 100644 --- a/src/DataTypes/transformTypesRecursively.cpp +++ b/src/DataTypes/transformTypesRecursively.cpp @@ -87,9 +87,6 @@ void transformTypesRecursively(DataTypes & types, std::function(transposed_nested_types[i]); - - if (transform_complex_types) - transform_complex_types(types); } if (transform_complex_types) From 433b961d5c34c55e8b5f0ce773ce3cf1f6c07538 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 1 Aug 2022 20:21:48 -0700 Subject: [PATCH 059/164] changed code for expected seg fault --- src/Loggers/OwnJSONPatternFormatter.cpp | 6 ++++- .../test_structured_logging_json/test.py | 27 +++---------------- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/src/Loggers/OwnJSONPatternFormatter.cpp b/src/Loggers/OwnJSONPatternFormatter.cpp index 4ab2066b548..d12506e2451 100644 --- a/src/Loggers/OwnJSONPatternFormatter.cpp +++ b/src/Loggers/OwnJSONPatternFormatter.cpp @@ -82,7 +82,11 @@ void OwnJSONPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ writeJSONString("source_file", wb, settings); DB::writeChar(':', wb); - writeJSONString(msg.getSourceFile(), wb, settings); + const char * source_file = msg.getSourceFile(); + if (source_file != nullptr) + writeJSONString(source_file, wb, settings); + else + writeJSONString("", wb, settings); DB::writeChar(',', wb); writeJSONString("source_line", wb, settings); diff --git a/tests/integration/test_structured_logging_json/test.py b/tests/integration/test_structured_logging_json/test.py index fdbf3651a0a..fff9365b45e 100644 --- a/tests/integration/test_structured_logging_json/test.py +++ b/tests/integration/test_structured_logging_json/test.py @@ -5,7 +5,6 @@ import json cluster = ClickHouseCluster(__file__) node = cluster.add_instance("node", main_configs=["configs/config_json.xml"]) - @pytest.fixture(scope="module") def start_cluster(): try: @@ -14,18 +13,6 @@ def start_cluster(): finally: cluster.shutdown() - -def get_log_array(logs): - log_array = [] - temp_log = "" - for i in range(0, len(logs)): - temp_log += logs[i] - if logs[i] == "}": - log_array.append(temp_log) - temp_log = "" - return log_array - - def is_json(log_json): try: json.loads(log_json) @@ -37,13 +24,7 @@ def is_json(log_json): def test_structured_logging_json_format(start_cluster): node.query("SELECT 1") - logs = node.grep_in_log(" ") - log_array = get_log_array(logs) - result = True - for i in range(0, len(log_array)): - temporary_result = is_json(log_array[i]) - result &= temporary_result - # we will test maximum 5 logs - if i >= min(4, len(log_array) - 1): - break - assert result + logs = node.grep_in_log("").split('\n') + length = min(10, len(logs)) + for i in range(0, length): + assert is_json(logs[i]) From 495df1da07d662904f1541d58a13da479e31b080 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 1 Aug 2022 20:48:57 -0700 Subject: [PATCH 060/164] reformat test.py to avoid style-check error --- tests/integration/test_structured_logging_json/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/tests/integration/test_structured_logging_json/test.py b/tests/integration/test_structured_logging_json/test.py index fff9365b45e..d56e9a41968 100644 --- a/tests/integration/test_structured_logging_json/test.py +++ b/tests/integration/test_structured_logging_json/test.py @@ -5,6 +5,7 @@ import json cluster = ClickHouseCluster(__file__) node = cluster.add_instance("node", main_configs=["configs/config_json.xml"]) + @pytest.fixture(scope="module") def start_cluster(): try: @@ -13,6 +14,7 @@ def start_cluster(): finally: cluster.shutdown() + def is_json(log_json): try: json.loads(log_json) @@ -24,7 +26,7 @@ def is_json(log_json): def test_structured_logging_json_format(start_cluster): node.query("SELECT 1") - logs = node.grep_in_log("").split('\n') + logs = node.grep_in_log("").split("\n") length = min(10, len(logs)) for i in range(0, length): assert is_json(logs[i]) From 6976c690bda83689766c6152ffd6c33c43c9fdbf Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 2 Aug 2022 10:09:18 +0200 Subject: [PATCH 061/164] Revert "Revert "Update arrow to fix possible data race"" --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index efdcd015cfd..3e03c6de41a 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit efdcd015cfdee1b6aa349c9ca227ca12c3d697f5 +Subproject commit 3e03c6de41a86df2fc54a61e9be1abaefeff6b0e From 3be13e4f925fdca311e32830e4bd569e63dada7b Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 2 Aug 2022 08:25:46 +0000 Subject: [PATCH 062/164] Try fix build under ppc64 --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index 3e03c6de41a..611e4a63107 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 3e03c6de41a86df2fc54a61e9be1abaefeff6b0e +Subproject commit 611e4a631072d822074f6ea119a2b8d20c8760ca From 0fad007220b48958dccc79d071e247622eb67661 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 2 Aug 2022 06:42:50 -0700 Subject: [PATCH 063/164] changing OwnPatternFormatter.cpp to its original version --- src/Loggers/OwnPatternFormatter.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Loggers/OwnPatternFormatter.cpp b/src/Loggers/OwnPatternFormatter.cpp index 67e038f4ea1..02a2c2e510b 100644 --- a/src/Loggers/OwnPatternFormatter.cpp +++ b/src/Loggers/OwnPatternFormatter.cpp @@ -3,16 +3,18 @@ #include #include #include -#include -#include -#include #include +#include +#include +#include -OwnPatternFormatter::OwnPatternFormatter(bool color_) : Poco::PatternFormatter(""), color(color_) +OwnPatternFormatter::OwnPatternFormatter(bool color_) + : Poco::PatternFormatter(""), color(color_) { } + void OwnPatternFormatter::formatExtended(const DB::ExtendedLogMessage & msg_ext, std::string & text) const { DB::WriteBufferFromString wb(text); From d58ae7871290f6d429dead539b7a00ab0b915a0b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 3 Aug 2022 06:30:48 -0700 Subject: [PATCH 064/164] removed default color enable in OwnPatternFormatter constructor calling --- src/Daemon/BaseDaemon.cpp | 2 +- src/Loggers/Loggers.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 61ad1785b2d..d9a12b1640d 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -1017,7 +1017,7 @@ void BaseDaemon::setupWatchdog() if 
(config().getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else - pf = new OwnPatternFormatter(true); + pf = new OwnPatternFormatter; Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel(std::cerr)); logger().setChannel(log); } diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 35415fb91ba..dec6bcd51c7 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -102,7 +102,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else - pf = new OwnPatternFormatter(true); + pf = new OwnPatternFormatter; Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, log_file); log->setLevel(log_level); @@ -143,7 +143,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else - pf = new OwnPatternFormatter(true); + pf = new OwnPatternFormatter; Poco::AutoPtr errorlog = new DB::OwnFormattingChannel(pf, error_log_file); errorlog->setLevel(errorlog_level); @@ -187,7 +187,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log if (config.getString("logger.formatting", "") == "json") pf = new OwnJSONPatternFormatter; else - pf = new OwnPatternFormatter(true); + pf = new OwnPatternFormatter; Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, syslog_channel); log->setLevel(syslog_level); From b41b716245dea528cc2f592dea9da3954b088a5a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 5 Aug 2022 18:16:35 +0200 Subject: [PATCH 065/164] Update AsynchronousMetrics.cpp --- src/Interpreters/AsynchronousMetrics.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 81fdef3d8a6..c1102a0652d 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -34,12 +34,6 @@ #endif -namespace CurrentMetrics -{ - extern const Metric MemoryTracking; -} - - namespace DB { From 2d7de7f683abed08412a333447ca99c2372f9742 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Aug 2022 14:42:40 +0300 Subject: [PATCH 066/164] Remove dictionaries from prometheus metrics on DETACH/DROP Fixes: #23436 (cc @kitaisreal) Introduced-in: #9622 (cc @YiuRULE) Signed-off-by: Azat Khuzhin --- src/Interpreters/ExternalLoader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index 1dcf48da130..704dff325b7 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -1299,6 +1299,7 @@ scope_guard ExternalLoader::addConfigRepository(std::unique_ptrremoveConfigRepository(ptr); + CurrentStatusInfo::unset(CurrentStatusInfo::DictionaryStatus, name); reloadConfig(name); }; } From 479ea9e6a6de10be44b9ddb8765134e836fdec4c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Aug 2022 15:06:34 +0300 Subject: [PATCH 067/164] tests: enable prometheus exporter Signed-off-by: Azat Khuzhin --- tests/config/config.d/prometheus.xml | 6 ++++++ tests/config/install.sh | 1 + 2 files changed, 7 insertions(+) create mode 100644 tests/config/config.d/prometheus.xml diff --git a/tests/config/config.d/prometheus.xml b/tests/config/config.d/prometheus.xml new file mode 100644 index 00000000000..7f8dbd1601f --- /dev/null +++ b/tests/config/config.d/prometheus.xml @@ 
-0,0 +1,6 @@ + + + /metrics + 9988 + + diff --git a/tests/config/install.sh b/tests/config/install.sh index 478601620e1..af492bb18b4 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -35,6 +35,7 @@ ln -sf $SRC_PATH/config.d/logging_no_rotate.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/merge_tree.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/metadata_cache.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/tcp_with_proxy.xml $DEST_SERVER_PATH/config.d/ +ln -sf $SRC_PATH/config.d/prometheus.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/top_level_domains_lists.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/top_level_domains_path.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/transactions.xml $DEST_SERVER_PATH/config.d/ From 96429e293a429b7eef3499c7dc689d6391aebd57 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Aug 2022 15:20:37 +0300 Subject: [PATCH 068/164] tests: cover ClickHouseStatusInfo_DictionaryStatus in prometheus metrics Signed-off-by: Azat Khuzhin --- ...HouseStatusInfo_DictionaryStatus.reference | 18 +++++++++ ...s_ClickHouseStatusInfo_DictionaryStatus.sh | 37 +++++++++++++++++++ tests/queries/shell_config.sh | 4 ++ 3 files changed, 59 insertions(+) create mode 100644 tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.reference create mode 100755 tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh diff --git a/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.reference b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.reference new file mode 100644 index 00000000000..50c91c3fa0c --- /dev/null +++ b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.reference @@ -0,0 +1,18 @@ +status before reload +status after reload +NOT_LOADED 0 +LOADED 0 +FAILED 1 +LOADING 0 +FAILED_AND_RELOADING 0 +LOADED_AND_RELOADING 0 +NOT_EXIST 0 +status after reload, table exists +NOT_LOADED 0 +LOADED 1 +FAILED 0 +LOADING 0 +FAILED_AND_RELOADING 0 +LOADED_AND_RELOADING 0 +NOT_EXIST 0 +status after drop diff --git a/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh new file mode 100755 index 00000000000..43f6d62bd10 --- /dev/null +++ b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +function get_dictionary_status() +{ + local name=$1 && shift + $CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL_PROMETHEUS" | { + awk -F'[{}=," ]' -vname="$name" '/ClickHouseStatusInfo_DictionaryStatus{/ && $(NF-3) == name { print $4, $NF }' + } +} + +$CLICKHOUSE_CLIENT -q "CREATE DICTIONARY dict (key Int, value String) PRIMARY KEY key SOURCE(CLICKHOUSE(TABLE data)) LAYOUT(HASHED()) LIFETIME(0)" +uuid="$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.dictionaries WHERE database = '$CLICKHOUSE_DATABASE' AND name = 'dict'")" + +echo 'status before reload' +get_dictionary_status "$uuid" + +# source table does not exists +# NOTE: when dictionary does not exist it produce BAD_ARGUMENTS error, so using UNKNOWN_TABLE is safe +$CLICKHOUSE_CLIENT -n -q "SYSTEM RELOAD DICTIONARY dict -- { serverError UNKNOWN_TABLE }" +echo 'status after reload' +get_dictionary_status "$uuid" + +# create source +$CLICKHOUSE_CLIENT -q "CREATE TABLE data (key Int, value String) Engine=Null" +$CLICKHOUSE_CLIENT -q "SYSTEM RELOAD DICTIONARY dict" +echo 'status after reload, table exists' +get_dictionary_status "$uuid" + +# remove dictionary +$CLICKHOUSE_CLIENT -q "DROP DICTIONARY dict" +$CLICKHOUSE_CLIENT -q "DROP TABLE data" +echo 'status after drop' +get_dictionary_status "$uuid" diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index ab5d5ddc1b6..963ac384148 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -66,6 +66,8 @@ export CLICKHOUSE_PORT_TCP_WITH_PROXY=${CLICKHOUSE_PORT_TCP_WITH_PROXY:=$(${CLIC export CLICKHOUSE_PORT_TCP_WITH_PROXY=${CLICKHOUSE_PORT_TCP_WITH_PROXY:="9010"} export CLICKHOUSE_PORT_HTTP=${CLICKHOUSE_PORT_HTTP:=$(${CLICKHOUSE_EXTRACT_CONFIG} --key=http_port 2>/dev/null)} export CLICKHOUSE_PORT_HTTP=${CLICKHOUSE_PORT_HTTP:="8123"} +export CLICKHOUSE_PORT_PROMTHEUS_PORT=${CLICKHOUSE_PORT_PROMTHEUS_PORT:=$(${CLICKHOUSE_EXTRACT_CONFIG} --key=prometheus.port 2>/dev/null)} +export CLICKHOUSE_PORT_PROMTHEUS_PORT=${CLICKHOUSE_PORT_PROMTHEUS_PORT:="9988"} export CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:=$(${CLICKHOUSE_EXTRACT_CONFIG} --try --key=https_port 2>/dev/null)} 2>/dev/null export CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:="8443"} export CLICKHOUSE_PORT_HTTP_PROTO=${CLICKHOUSE_PORT_HTTP_PROTO:="http"} @@ -98,6 +100,8 @@ then export CLICKHOUSE_URL_HTTPS="${CLICKHOUSE_URL_HTTPS}?${CLICKHOUSE_URL_PARAMS}" fi +export CLICKHOUSE_URL_PROMETHEUS=${CLICKHOUSE_URL_PROMETHEUS:="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_PROMTHEUS_PORT}/metrics"} + export CLICKHOUSE_PORT_INTERSERVER=${CLICKHOUSE_PORT_INTERSERVER:=$(${CLICKHOUSE_EXTRACT_CONFIG} --try --key=interserver_http_port 2>/dev/null)} 2>/dev/null export CLICKHOUSE_PORT_INTERSERVER=${CLICKHOUSE_PORT_INTERSERVER:="9009"} export CLICKHOUSE_URL_INTERSERVER=${CLICKHOUSE_URL_INTERSERVER:="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_INTERSERVER}/"} From 791377e4dc0247b9dec39281779bde927b7e8424 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 6 Aug 2022 13:00:53 +0300 Subject: [PATCH 069/164] tests: avoid prometheus.port overlap for replicated database Signed-off-by: Azat Khuzhin --- docker/test/stateless/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 075f588cae3..23210e6a7c9 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -58,6 +58,7 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 
1 ]] --tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \ --mysql_port 19004 --postgresql_port 19005 \ --keeper_server.tcp_port 19181 --keeper_server.server_id 2 \ + --prometheus.port 19988 \ --macros.replica r2 # It doesn't work :( mkdir -p /var/run/clickhouse-server2 @@ -69,6 +70,7 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] --tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \ --mysql_port 29004 --postgresql_port 29005 \ --keeper_server.tcp_port 29181 --keeper_server.server_id 3 \ + --prometheus.port 29988 \ --macros.shard s2 # It doesn't work :( MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours) From 1304e3487cd0af4a71da1a6bec24331fcd0ae8d0 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 Aug 2022 13:43:14 +0000 Subject: [PATCH 070/164] Add comments, remove unneded stuff --- src/Formats/EscapingRuleUtils.cpp | 10 +- .../Formats/RowInputFormatWithNamesAndTypes.h | 4 - ...modifiers_with_non-default_types.reference | 113 ------------------ 3 files changed, 9 insertions(+), 118 deletions(-) delete mode 100644 tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index f8c002e87ee..a5c4dd1dd47 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -288,8 +288,13 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings } } - /// If we have date/datetimes and smth else, convert them to string. /// If we have only dates and datetimes, convert dates to datetime. + /// If we have date/datetimes and smth else, convert them to string, because + /// There is a special case when we inferred both Date/DateTime and Int64 from Strings, + /// for example: "arr: ["2020-01-01", "2000"]" -> Tuple(Date, Int64), + /// so if we have Date/DateTime and smth else (not only String) we should + /// convert Date/DateTime back to String, so then we will be able to + /// convert Int64 back to String as well. if (settings.try_infer_dates || settings.try_infer_datetimes) { bool have_dates = false; @@ -349,6 +354,9 @@ void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings if (settings.json.read_bools_as_numbers) { + /// Note that have_floats and have_integers both cannot be + /// equal to true as in one of previous checks we convert + /// integers to floats if we have both. 
bool have_floats = false; bool have_integers = false; bool have_bools = false; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index e6a587b446c..0afc0146846 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -141,9 +141,5 @@ private: FormatWithNamesAndTypesReader * format_reader; }; -/// [2, 2, 4, 0] -> [2, 4, 4, 0] -> [4, 4, 0] -> [4, 4, 0, 0] -/// [2, 4, 4, 2] -> [2, 8, 2, 0] -/// [2, 2, 4, 4] -> [2, 4, 4, 4] -> [4, 4, 4, 0], -> [4, 4, 8, 0] -> [4, 8, 0, 0] - } diff --git a/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference b/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference deleted file mode 100644 index 183c63d1222..00000000000 --- a/tests/queries/0_stateless/02313_group_by_modifiers_with_non-default_types.reference +++ /dev/null @@ -1,113 +0,0 @@ --- { echoOn } -SELECT - count() as d, a, b, c -FROM test02313 -GROUP BY ROLLUP(a, b, c) -ORDER BY d, a, b, c; -1 one default 0 -1 one default 2 -1 one default 4 -1 one default 6 -1 one default 8 -1 two non-default 1 -1 two non-default 3 -1 two non-default 5 -1 two non-default 7 -1 two non-default 9 -5 one default 0 -5 one default 0 -5 two default 0 -5 two non-default 0 -10 one default 0 -SELECT - count() as d, a, b, c -FROM test02313 -GROUP BY CUBE(a, b, c) -ORDER BY d, a, b, c; -1 one default 0 -1 one default 0 -1 one default 0 -1 one default 0 -1 one default 1 -1 one default 2 -1 one default 2 -1 one default 2 -1 one default 2 -1 one default 3 -1 one default 4 -1 one default 4 -1 one default 4 -1 one default 4 -1 one default 5 -1 one default 6 -1 one default 6 -1 one default 6 -1 one default 6 -1 one default 7 -1 one default 8 -1 one default 8 -1 one default 8 -1 one default 8 -1 one default 9 -1 one non-default 1 -1 one non-default 3 -1 one non-default 5 -1 one non-default 7 -1 one non-default 9 -1 two default 1 -1 two default 3 -1 two default 5 -1 two default 7 -1 two default 9 -1 two non-default 1 -1 two non-default 3 -1 two non-default 5 -1 two non-default 7 -1 two non-default 9 -5 one default 0 -5 one default 0 -5 one default 0 -5 one non-default 0 -5 two default 0 -5 two non-default 0 -10 one default 0 -SELECT - count() as d, a, b, c -FROM test02313 -GROUP BY GROUPING SETS - ( - (c), - (a, c), - (b, c) - ) -ORDER BY d, a, b, c; -1 one default 0 -1 one default 0 -1 one default 0 -1 one default 1 -1 one default 2 -1 one default 2 -1 one default 2 -1 one default 3 -1 one default 4 -1 one default 4 -1 one default 4 -1 one default 5 -1 one default 6 -1 one default 6 -1 one default 6 -1 one default 7 -1 one default 8 -1 one default 8 -1 one default 8 -1 one default 9 -1 one non-default 1 -1 one non-default 3 -1 one non-default 5 -1 one non-default 7 -1 one non-default 9 -1 two default 1 -1 two default 3 -1 two default 5 -1 two default 7 -1 two default 9 From 5cd3558d07044a7990428ed62da9a9074634f4f5 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 8 Aug 2022 14:10:13 +0000 Subject: [PATCH 071/164] Use different root path for counter test --- .../src/jepsen/clickhouse_keeper/counter.clj | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj index f82d3f4c348..60b29bd799a 100644 --- 
a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj @@ -9,6 +9,7 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) +(def root-path "/counter") (defn r [_ _] {:type :invoke, :f :read}) (defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) @@ -20,17 +21,19 @@ :conn (zk-connect node 9181 30000)) :nodename node)) - (setup! [this test]) + (setup! [this test] + (exec-with-retries 30 (fn [] + (zk-create-if-not-exists conn root-path "")))) (invoke! [this test op] (case (:f op) :read (exec-with-retries 30 (fn [] (assoc op :type :ok - :value (count (zk-list conn "/"))))) + :value (count (zk-list conn root-path))))) :add (try (do - (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) + (zk-multi-create-many-seq-nodes conn (concat-path root-path "seq-") (:value op)) (assoc op :type :ok)) (catch Exception _ (assoc op :type :info, :error :connect-error))))) From c8f8ceea8d5694eaf5ad4df41297a55e932d6fcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 8 Aug 2022 18:25:34 +0200 Subject: [PATCH 072/164] Fix hashId crash and salt not applying --- src/Functions/FunctionHashID.h | 7 ++++--- tests/queries/0_stateless/02293_hashid.reference | 7 +++++-- tests/queries/0_stateless/02293_hashid.sql | 8 ++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/Functions/FunctionHashID.h b/src/Functions/FunctionHashID.h index 30f08c96eca..e469381a784 100644 --- a/src/Functions/FunctionHashID.h +++ b/src/Functions/FunctionHashID.h @@ -52,6 +52,7 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { @@ -69,7 +70,7 @@ public: if (arguments.size() > 1) { const auto & hash_col = arguments[1]; - if (!isString(hash_col.type) || !isColumnConst(*hash_col.column.get())) + if (!isString(hash_col.type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument of function {} must be String, got {}", @@ -80,7 +81,7 @@ public: if (arguments.size() > 2) { const auto & min_length_col = arguments[2]; - if (!isUInt8(min_length_col.type) || !isColumnConst(*min_length_col.column.get())) + if (!isUInt8(min_length_col.type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Third argument of function {} must be UInt8, got {}", @@ -91,7 +92,7 @@ public: if (arguments.size() > 3) { const auto & alphabet_col = arguments[3]; - if (!isString(alphabet_col.type) || !isColumnConst(*alphabet_col.column.get())) + if (!isString(alphabet_col.type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Fourth argument of function {} must be String, got {}", diff --git a/tests/queries/0_stateless/02293_hashid.reference b/tests/queries/0_stateless/02293_hashid.reference index f36b1500288..dfc78349c05 100644 --- a/tests/queries/0_stateless/02293_hashid.reference +++ b/tests/queries/0_stateless/02293_hashid.reference @@ -8,5 +8,8 @@ 2 obmgndljgajpkeao 3 dldokmpjpgjgeanb 4 nkdlpgajngjnobme -xkOpDGxQpVB -jR +YQrvD5XGvbx +Bm3zaOq7zbp +oV +oV +6b diff --git a/tests/queries/0_stateless/02293_hashid.sql b/tests/queries/0_stateless/02293_hashid.sql index 45aaefe7356..173ec2789c2 
100644 --- a/tests/queries/0_stateless/02293_hashid.sql +++ b/tests/queries/0_stateless/02293_hashid.sql @@ -3,5 +3,13 @@ SET allow_experimental_hash_functions = 1; select number, hashid(number) from system.numbers limit 5; select number, hashid(number, 's3cr3t', 16, 'abcdefghijklmnop') from system.numbers limit 5; select hashid(1234567890123456, 's3cr3t'); +select hashid(1234567890123456, 's3cr3t2'); SELECT hashid(1, hashid(2)); +SELECT hashid(1, 'k5'); +SELECT hashid(1, 'k5_othersalt'); + +-- https://github.com/ClickHouse/ClickHouse/issues/39672 +SELECT + JSONExtractRaw(257, NULL), + hashid(1024, if(rand() % 10, 'truetruetruetrue', NULL), 's3\0r3t'); -- {serverError 43} From e584d1276bfaa4f4d8556472286fe3ed49608a4d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Aug 2022 14:04:41 +0000 Subject: [PATCH 073/164] Factorize generic code from ExternalDictLibBridgeHelper into base class - When catboost will be added into library-bridge later on, a lot of code will need to be duplicated. - Avoid that by factorizing stuff which is not specific to external dictionaries for reuse into a common base class. - This is a refactoring without semantic change. --- .../ExternalDictionaryLibraryBridgeHelper.cpp | 41 ++++----------- .../ExternalDictionaryLibraryBridgeHelper.h | 37 ++------------ src/BridgeHelper/LibraryBridgeHelper.cpp | 34 +++++++++++++ src/BridgeHelper/LibraryBridgeHelper.h | 50 +++++++++++++++++++ 4 files changed, 96 insertions(+), 66 deletions(-) create mode 100644 src/BridgeHelper/LibraryBridgeHelper.cpp create mode 100644 src/BridgeHelper/LibraryBridgeHelper.h diff --git a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp index c4df7e74e7a..28425cfdb31 100644 --- a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp +++ b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp @@ -31,16 +31,10 @@ ExternalDictionaryLibraryBridgeHelper::ExternalDictionaryLibraryBridgeHelper( const Block & sample_block_, const Field & dictionary_id_, const LibraryInitData & library_data_) - : IBridgeHelper(context_->getGlobalContext()) - , log(&Poco::Logger::get("ExternalDictionaryLibraryBridgeHelper")) + : LibraryBridgeHelper(context_->getGlobalContext()) , sample_block(sample_block_) - , config(context_->getConfigRef()) - , http_timeout(context_->getGlobalContext()->getSettingsRef().http_receive_timeout.value) , library_data(library_data_) , dictionary_id(dictionary_id_) - , bridge_host(config.getString("library_bridge.host", DEFAULT_HOST)) - , bridge_port(config.getUInt("library_bridge.port", DEFAULT_PORT)) - , http_timeouts(ConnectionTimeouts::getHTTPTimeouts(context_)) { } @@ -71,22 +65,6 @@ Poco::URI ExternalDictionaryLibraryBridgeHelper::createRequestURI(const String & } -Poco::URI ExternalDictionaryLibraryBridgeHelper::createBaseURI() const -{ - Poco::URI uri; - uri.setHost(bridge_host); - uri.setPort(bridge_port); - uri.setScheme("http"); - return uri; -} - - -void ExternalDictionaryLibraryBridgeHelper::startBridge(std::unique_ptr cmd) const -{ - getContext()->addBridgeCommand(std::move(cmd)); -} - - bool ExternalDictionaryLibraryBridgeHelper::bridgeHandShake() { String result; @@ -225,6 +203,14 @@ QueryPipeline ExternalDictionaryLibraryBridgeHelper::loadAll() } +static String getDictIdsString(const std::vector & ids) +{ + WriteBufferFromOwnString out; + writeVectorBinary(ids, out); + return out.str(); +} + + QueryPipeline ExternalDictionaryLibraryBridgeHelper::loadIds(const std::vector & ids) { 
startBridgeSync(); @@ -283,13 +269,4 @@ QueryPipeline ExternalDictionaryLibraryBridgeHelper::loadBase(const Poco::URI & return QueryPipeline(std::move(source)); } - -String ExternalDictionaryLibraryBridgeHelper::getDictIdsString(const std::vector & ids) -{ - WriteBufferFromOwnString out; - writeVectorBinary(ids, out); - return out.str(); -} - - } diff --git a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h index d7504bbcf0e..feebf997387 100644 --- a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h +++ b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h @@ -2,10 +2,9 @@ #include #include -#include #include #include -#include +#include #include @@ -14,7 +13,8 @@ namespace DB class Pipe; -class ExternalDictionaryLibraryBridgeHelper : public IBridgeHelper +// Class to access the external dictionary part of the clickhouse-library-bridge. +class ExternalDictionaryLibraryBridgeHelper : public LibraryBridgeHelper { public: @@ -25,7 +25,6 @@ public: String dict_attributes; }; - static constexpr inline size_t DEFAULT_PORT = 9012; static constexpr inline auto PING_HANDLER = "/extdict_ping"; static constexpr inline auto MAIN_HANDLER = "/extdict_request"; @@ -56,26 +55,6 @@ protected: bool bridgeHandShake() override; - void startBridge(std::unique_ptr cmd) const override; - - String serviceAlias() const override { return "clickhouse-library-bridge"; } - - String serviceFileName() const override { return serviceAlias(); } - - size_t getDefaultPort() const override { return DEFAULT_PORT; } - - bool startBridgeManually() const override { return false; } - - String configPrefix() const override { return "library_bridge"; } - - const Poco::Util::AbstractConfiguration & getConfig() const override { return config; } - - Poco::Logger * getLog() const override { return log; } - - Poco::Timespan getHTTPTimeout() const override { return http_timeout; } - - Poco::URI createBaseURI() const override; - QueryPipeline loadBase(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = {}); bool executeRequest(const Poco::URI & uri, ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = {}) const; @@ -94,20 +73,10 @@ private: Poco::URI createRequestURI(const String & method) const; - static String getDictIdsString(const std::vector & ids); - - Poco::Logger * log; const Block sample_block; - const Poco::Util::AbstractConfiguration & config; - const Poco::Timespan http_timeout; - LibraryInitData library_data; Field dictionary_id; - std::string bridge_host; - size_t bridge_port; bool library_initialized = false; - ConnectionTimeouts http_timeouts; - Poco::Net::HTTPBasicCredentials credentials{}; }; } diff --git a/src/BridgeHelper/LibraryBridgeHelper.cpp b/src/BridgeHelper/LibraryBridgeHelper.cpp new file mode 100644 index 00000000000..d83e62a0fd3 --- /dev/null +++ b/src/BridgeHelper/LibraryBridgeHelper.cpp @@ -0,0 +1,34 @@ +#include "LibraryBridgeHelper.h" + +namespace DB +{ + +LibraryBridgeHelper::LibraryBridgeHelper(ContextPtr context_) + : IBridgeHelper(context_) + , config(context_->getConfigRef()) + , log(&Poco::Logger::get("LibraryBridgeHelper")) + , http_timeout(context_->getGlobalContext()->getSettingsRef().http_receive_timeout.value) + , bridge_host(config.getString("library_bridge.host", DEFAULT_HOST)) + , bridge_port(config.getUInt("library_bridge.port", DEFAULT_PORT)) + , http_timeouts(ConnectionTimeouts::getHTTPTimeouts(context_)) +{ +} + + +void 
LibraryBridgeHelper::startBridge(std::unique_ptr cmd) const +{ + getContext()->addBridgeCommand(std::move(cmd)); +} + + +Poco::URI LibraryBridgeHelper::createBaseURI() const +{ + Poco::URI uri; + uri.setHost(bridge_host); + uri.setPort(bridge_port); + uri.setScheme("http"); + return uri; +} + + +} diff --git a/src/BridgeHelper/LibraryBridgeHelper.h b/src/BridgeHelper/LibraryBridgeHelper.h new file mode 100644 index 00000000000..a00421570fe --- /dev/null +++ b/src/BridgeHelper/LibraryBridgeHelper.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +// Common base class to access the clickhouse-library-bridge. +class LibraryBridgeHelper : public IBridgeHelper +{ +protected: + explicit LibraryBridgeHelper(ContextPtr context_); + + void startBridge(std::unique_ptr cmd) const override; + + String serviceAlias() const override { return "clickhouse-library-bridge"; } + + String serviceFileName() const override { return serviceAlias(); } + + size_t getDefaultPort() const override { return DEFAULT_PORT; } + + bool startBridgeManually() const override { return false; } + + String configPrefix() const override { return "library_bridge"; } + + const Poco::Util::AbstractConfiguration & getConfig() const override { return config; } + + Poco::Logger * getLog() const override { return log; } + + Poco::Timespan getHTTPTimeout() const override { return http_timeout; } + + Poco::URI createBaseURI() const override; + + static constexpr inline size_t DEFAULT_PORT = 9012; + + const Poco::Util::AbstractConfiguration & config; + Poco::Logger * log; + const Poco::Timespan http_timeout; + std::string bridge_host; + size_t bridge_port; + ConnectionTimeouts http_timeouts; + Poco::Net::HTTPBasicCredentials credentials{}; +}; + +} From 9952ab10991ffcdcb119bf76d47de670d2cb0286 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 5 Aug 2022 07:57:47 +0000 Subject: [PATCH 074/164] Prefix class names "LibraryBridge*Handler" with "ExternalDictionary" - necessary to disambiguate the names from "CatBoost"-"LibraryBridgeHandler" which will be added in a next step --- .../library-bridge/LibraryBridgeHandlerFactory.cpp | 4 ++-- programs/library-bridge/LibraryBridgeHandlers.cpp | 12 ++++++------ programs/library-bridge/LibraryBridgeHandlers.h | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp index 1285be6a07d..f8f6a23e1be 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp @@ -26,13 +26,13 @@ std::unique_ptr LibraryBridgeHandlerFactory::createRequestHa if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) { if (uri.getPath() == "/extdict_ping") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(keep_alive_timeout, getContext()); } if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/extdict_request") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(keep_alive_timeout, getContext()); } return nullptr; diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index 9537251954c..50fef846797 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -78,14 +78,14 @@ static void writeData(Block data, OutputFormatPtr 
format) executor.execute(); } -LibraryBridgeRequestHandler::LibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_) +ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) - , log(&Poco::Logger::get("LibraryBridgeRequestHandler")) + , log(&Poco::Logger::get("ExternalDictionaryLibraryBridgeRequestHandler")) , keep_alive_timeout(keep_alive_timeout_) { } -void LibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) +void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { LOG_TRACE(log, "Request URI: {}", request.getURI()); HTMLForm params(getContext()->getSettingsRef(), request); @@ -340,14 +340,14 @@ void LibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & request, HTT } } -LibraryBridgeExistsHandler::LibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) +ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) , keep_alive_timeout(keep_alive_timeout_) - , log(&Poco::Logger::get("LibraryBridgeExistsHandler")) + , log(&Poco::Logger::get("ExternalDictionaryLibraryBridgeExistsHandler")) { } -void LibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) +void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { try { diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 454bcc46acc..63053c62800 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -17,10 +17,10 @@ namespace DB /// names of dictionary attributes, sample block to parse block of null values, block of null values. Everything is /// passed in binary format and is urlencoded. When dictionary is cloned, a new handler is created. /// Each handler is unique to dictionary. 
-class LibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext +class ExternalDictionaryLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - LibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; @@ -32,10 +32,10 @@ private: }; -class LibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext +class ExternalDictionaryLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - LibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; From d609da182b302dc9349ed4012fb5320d2992dc02 Mon Sep 17 00:00:00 2001 From: HarryLeeIBM Date: Mon, 8 Aug 2022 11:28:35 -0700 Subject: [PATCH 075/164] Fix Endian issue in Codec for s390x --- .../CompressionCodecDoubleDelta.cpp | 24 +++++++++---------- src/Compression/CompressionCodecGorilla.cpp | 16 ++++++------- src/Compression/ICompressionCodec.cpp | 8 +++---- .../tests/gtest_compressionCodec.cpp | 6 ++--- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index 89004cec057..017c82701f5 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -292,7 +292,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) const char * dest_start = dest; const UInt32 items_count = source_size / sizeof(ValueType); - unalignedStore(dest, items_count); + unalignedStoreLE(dest, items_count); dest += sizeof(items_count); ValueType prev_value{}; @@ -300,8 +300,8 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) if (source < source_end) { - prev_value = unalignedLoad(source); - unalignedStore(dest, prev_value); + prev_value = unalignedLoadLE(source); + unalignedStoreLE(dest, prev_value); source += sizeof(prev_value); dest += sizeof(prev_value); @@ -309,10 +309,10 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) if (source < source_end) { - const ValueType curr_value = unalignedLoad(source); + const ValueType curr_value = unalignedLoadLE(source); prev_delta = curr_value - prev_value; - unalignedStore(dest, prev_delta); + unalignedStoreLE(dest, prev_delta); source += sizeof(curr_value); dest += sizeof(prev_delta); @@ -324,7 +324,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) int item = 2; for (; source < source_end; source += sizeof(ValueType), ++item) { - const ValueType curr_value = unalignedLoad(source); + const ValueType curr_value = unalignedLoadLE(source); const UnsignedDeltaType delta = curr_value - prev_value; const UnsignedDeltaType double_delta = delta - prev_delta; @@ -368,7 +368,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest, if (source + sizeof(UInt32) > source_end) return; - const UInt32 items_count = unalignedLoad(source); + const UInt32 items_count = unalignedLoadLE(source); source += sizeof(items_count); ValueType prev_value{}; @@ -378,10 +378,10 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest, if (source + 
sizeof(ValueType) > source_end || items_count < 1) return; - prev_value = unalignedLoad(source); + prev_value = unalignedLoadLE(source); if (dest + sizeof(prev_value) > output_end) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data"); - unalignedStore(dest, prev_value); + unalignedStoreLE(dest, prev_value); source += sizeof(prev_value); dest += sizeof(prev_value); @@ -390,11 +390,11 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest, if (source + sizeof(UnsignedDeltaType) > source_end || items_count < 2) return; - prev_delta = unalignedLoad(source); + prev_delta = unalignedLoadLE(source); prev_value = prev_value + static_cast(prev_delta); if (dest + sizeof(prev_value) > output_end) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data"); - unalignedStore(dest, prev_value); + unalignedStoreLE(dest, prev_value); source += sizeof(prev_delta); dest += sizeof(prev_value); @@ -427,7 +427,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest, const ValueType curr_value = prev_value + delta; if (dest + sizeof(curr_value) > output_end) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data"); - unalignedStore(dest, curr_value); + unalignedStoreLE(dest, curr_value); dest += sizeof(curr_value); prev_delta = curr_value - prev_value; diff --git a/src/Compression/CompressionCodecGorilla.cpp b/src/Compression/CompressionCodecGorilla.cpp index 4631a666830..0ca3e5660e0 100644 --- a/src/Compression/CompressionCodecGorilla.cpp +++ b/src/Compression/CompressionCodecGorilla.cpp @@ -208,7 +208,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, const UInt32 items_count = source_size / sizeof(T); - unalignedStore(dest, items_count); + unalignedStoreLE(dest, items_count); dest += sizeof(items_count); T prev_value{}; @@ -217,8 +217,8 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, if (source < source_end) { - prev_value = unalignedLoad(source); - unalignedStore(dest, prev_value); + prev_value = unalignedLoadLE(source); + unalignedStoreLE(dest, prev_value); source += sizeof(prev_value); dest += sizeof(prev_value); @@ -228,7 +228,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, while (source < source_end) { - const T curr_value = unalignedLoad(source); + const T curr_value = unalignedLoadLE(source); source += sizeof(curr_value); const auto xored_data = curr_value ^ prev_value; @@ -274,7 +274,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) if (source + sizeof(UInt32) > source_end) return; - const UInt32 items_count = unalignedLoad(source); + const UInt32 items_count = unalignedLoadLE(source); source += sizeof(items_count); T prev_value{}; @@ -283,8 +283,8 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) if (source + sizeof(T) > source_end || items_count < 1) return; - prev_value = unalignedLoad(source); - unalignedStore(dest, prev_value); + prev_value = unalignedLoadLE(source); + unalignedStoreLE(dest, prev_value); source += sizeof(prev_value); dest += sizeof(prev_value); @@ -326,7 +326,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) } // else: 0b0 prefix - use prev_value - unalignedStore(dest, curr_value); + unalignedStoreLE(dest, curr_value); dest += sizeof(curr_value); prev_xored_info = curr_xored_info; diff --git a/src/Compression/ICompressionCodec.cpp 
b/src/Compression/ICompressionCodec.cpp index c48ca99d452..4c2b95682b3 100644 --- a/src/Compression/ICompressionCodec.cpp +++ b/src/Compression/ICompressionCodec.cpp @@ -86,8 +86,8 @@ UInt32 ICompressionCodec::compress(const char * source, UInt32 source_size, char UInt8 header_size = getHeaderSize(); /// Write data from header_size UInt32 compressed_bytes_written = doCompressData(source, source_size, &dest[header_size]); - unalignedStore(&dest[1], compressed_bytes_written + header_size); - unalignedStore(&dest[5], source_size); + unalignedStoreLE(&dest[1], compressed_bytes_written + header_size); + unalignedStoreLE(&dest[5], source_size); return header_size + compressed_bytes_written; } @@ -112,7 +112,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt32 ICompressionCodec::readCompressedBlockSize(const char * source) { - UInt32 compressed_block_size = unalignedLoad(&source[1]); + UInt32 compressed_block_size = unalignedLoadLE(&source[1]); if (compressed_block_size == 0) throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with compressed block size 0"); return compressed_block_size; @@ -121,7 +121,7 @@ UInt32 ICompressionCodec::readCompressedBlockSize(const char * source) UInt32 ICompressionCodec::readDecompressedBlockSize(const char * source) { - UInt32 decompressed_block_size = unalignedLoad(&source[5]); + UInt32 decompressed_block_size = unalignedLoadLE(&source[5]); if (decompressed_block_size == 0) throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with decompressed block size 0"); return decompressed_block_size; diff --git a/src/Compression/tests/gtest_compressionCodec.cpp b/src/Compression/tests/gtest_compressionCodec.cpp index 77050908265..b634a3137cd 100644 --- a/src/Compression/tests/gtest_compressionCodec.cpp +++ b/src/Compression/tests/gtest_compressionCodec.cpp @@ -175,7 +175,7 @@ private: throw std::runtime_error("No more data to read"); } - current_value = unalignedLoad(data); + current_value = unalignedLoadLE(data); data = reinterpret_cast(data) + sizeof(T); } }; @@ -371,7 +371,7 @@ CodecTestSequence makeSeq(Args && ... args) char * write_pos = data.data(); for (const auto & v : vals) { - unalignedStore(write_pos, v); + unalignedStoreLE(write_pos, v); write_pos += sizeof(v); } @@ -393,7 +393,7 @@ CodecTestSequence generateSeq(Generator gen, const char* gen_name, B Begin = 0, { const T v = gen(static_cast(i)); - unalignedStore(write_pos, v); + unalignedStoreLE(write_pos, v); write_pos += sizeof(v); } From e0d5020a92d0502bd915ff4dd59c4126fcbbd87b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Aug 2022 14:40:19 +0000 Subject: [PATCH 076/164] Add simple versioning to the *-bridge-to-server protocol - In general, it is expected that clickhouse-*-bridges and clickhouse-server were built from the same source version (e.g. are upgraded "atomically"). If that is not the case, we should at least be able to detect the mismatch and abort. - This commit adds a URL parameter "version", defined in a header shared by the server and bridges. The bridge returns an error in case of mismatch. - The version is *not* sent and checked for "ping" requests (used for handshake), only for regular requests sent after handshake.
This is because the internally thrown server-side exception due to HTTP failure does not propagate the exact HTTP error (it only stores the error as text), and as a result, the server-side handshake code simply retries in case of error with exponential backoff and finally fails with a "timeout error". This is reasonable as pings typically fail due to a timeout. However, without a rework of HTTP exceptions, version mismatch during ping would also appear as "timeout", which is too misleading. The behavior may be changed later if needed. - Note that introducing a version parameter does not represent a protocol upgrade itself. Bridges older than the server will simply ignore the field. Only servers older than the bridges receive an error, but such a situation should never occur in practice. --- .../library-bridge/LibraryBridgeHandlers.cpp | 26 +++++++++++++++++++ .../library-bridge/LibraryBridgeHandlers.h | 1 - programs/odbc-bridge/ColumnInfoHandler.cpp | 26 +++++++++++++++++++ .../odbc-bridge/IdentifierQuoteHandler.cpp | 26 +++++++++++++++++++ programs/odbc-bridge/MainHandler.cpp | 25 ++++++++++++++++++ programs/odbc-bridge/SchemaAllowedHandler.cpp | 26 +++++++++++++++++++ .../ExternalDictionaryLibraryBridgeHelper.cpp | 1 + src/BridgeHelper/LibraryBridgeHelper.h | 1 + src/BridgeHelper/XDBCBridgeHelper.h | 5 ++++ src/Common/BridgeProtocolVersion.h | 14 ++++++++++ 10 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/Common/BridgeProtocolVersion.h diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index 50fef846797..bdb5f3ca02b 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include namespace DB @@ -90,6 +92,29 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ LOG_TRACE(log, "Request URI: {}", request.getURI()); HTMLForm params(getContext()->getSettingsRef(), request); + if (!params.has("version")) + { + processError(response, "No 'version' in request URL"); + return; + } + else + { + String version_str = params.get("version"); + size_t version; + auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); + if (ec != std::errc()) + { + processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); + return; + } + if (version != LIBRARY_BRIDGE_PROTOCOL_VERSION) + { + // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version + processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + } + if (!params.has("method")) { processError(response, "No 'method' in request URL"); @@ -361,6 +386,7 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque } std::string dictionary_id = params.get("dictionary_id"); + auto library_handler = ExternalDictionaryLibraryHandlerFactory::instance().get(dictionary_id); String res = library_handler ?
"1" : "0"; diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 63053c62800..b20f40616ce 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -9,7 +9,6 @@ namespace DB { - /// Handler for requests to Library Dictionary Source, returns response in RowBinary format. /// When a library dictionary source is created, it sends 'extDict_libNew' request to library bridge (which is started on first /// request to it, if it was not yet started). On this request a new sharedLibrayHandler is added to a diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index f043c832aa4..7d2f6f57d34 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "getIdentifierQuote.h" #include "validateODBCConnectionString.h" @@ -22,6 +23,8 @@ #include #include +#include + namespace DB { @@ -80,6 +83,29 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ LOG_WARNING(log, fmt::runtime(message)); }; + if (!params.has("version")) + { + process_error("No 'version' in request URL"); + return; + } + else + { + String version_str = params.get("version"); + size_t version; + auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); + if (ec != std::errc()) + { + process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and odbc-bridge have the same version."); + return; + } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version + process_error("Server and odbc-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + } + if (!params.has("table")) { process_error("No 'table' param in request URL"); diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index c3a88804a51..b162a2ea324 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -10,12 +10,15 @@ #include #include #include +#include #include #include #include "getIdentifierQuote.h" #include "validateODBCConnectionString.h" #include "ODBCPooledConnectionFactory.h" +#include + namespace DB { @@ -32,6 +35,29 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ LOG_WARNING(log, fmt::runtime(message)); }; + if (!params.has("version")) + { + process_error("No 'version' in request URL"); + return; + } + else + { + String version_str = params.get("version"); + size_t version; + auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); + if (ec != std::errc()) + { + process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and odbc-bridge have the same version."); + return; + } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version + process_error("Server and odbc-bridge have different versions: '" + std::to_string(version) + "' vs.
'" + std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + } + if (!params.has("connection_string")) { process_error("No 'connection_string' in request URL"); diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index bb1cf53205d..6ece12198d3 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -17,10 +17,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -55,6 +57,29 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse HTMLForm params(getContext()->getSettingsRef(), request); LOG_TRACE(log, "Request URI: {}", request.getURI()); + if (!params.has("version")) + { + processError(response, "No 'version' in request URL"); + return; + } + else + { + String version_str = params.get("version"); + size_t version; + auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); + if (ec != std::errc()) + { + processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and odbc-bridge have the same version."); + return; + } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version + processError(response, "Server and odbc-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + } + if (mode == "read") params.read(request.getStream()); diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index 54dfa20f808..f70474fc898 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -7,12 +7,15 @@ #include #include #include +#include #include #include "validateODBCConnectionString.h" #include "ODBCPooledConnectionFactory.h" #include #include +#include + namespace DB { @@ -40,6 +43,29 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer LOG_WARNING(log, fmt::runtime(message)); }; + if (!params.has("version")) + { + process_error("No 'version' in request URL"); + return; + } + else + { + String version_str = params.get("version"); + size_t version; + auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); + if (ec != std::errc()) + { + process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and odbc-bridge have the same version."); + return; + } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version + process_error("Server and odbc-bridge have different versions: '" + std::to_string(version) + "' vs.
'" + std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + } + if (!params.has("connection_string")) { process_error("No 'connection_string' in request URL"); diff --git a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp index 28425cfdb31..4f943e47fa9 100644 --- a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp +++ b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp @@ -59,6 +59,7 @@ Poco::URI ExternalDictionaryLibraryBridgeHelper::getMainURI() const Poco::URI ExternalDictionaryLibraryBridgeHelper::createRequestURI(const String & method) const { auto uri = getMainURI(); + uri.addQueryParameter("version", std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION)); uri.addQueryParameter("dictionary_id", toString(dictionary_id)); uri.addQueryParameter("method", method); return uri; diff --git a/src/BridgeHelper/LibraryBridgeHelper.h b/src/BridgeHelper/LibraryBridgeHelper.h index a00421570fe..447a4c713f4 100644 --- a/src/BridgeHelper/LibraryBridgeHelper.h +++ b/src/BridgeHelper/LibraryBridgeHelper.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { diff --git a/src/BridgeHelper/XDBCBridgeHelper.h b/src/BridgeHelper/XDBCBridgeHelper.h index 84aa73ef8e5..f2cd76d5d81 100644 --- a/src/BridgeHelper/XDBCBridgeHelper.h +++ b/src/BridgeHelper/XDBCBridgeHelper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ protected: { auto uri = createBaseURI(); uri.setPath(MAIN_HANDLER); + uri.addQueryParameter("version", std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION)); return uri; } @@ -163,6 +165,7 @@ protected: { auto uri = createBaseURI(); uri.setPath(COL_INFO_HANDLER); + uri.addQueryParameter("version", std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION)); return uri; } @@ -184,6 +187,7 @@ protected: auto uri = createBaseURI(); uri.setPath(SCHEMA_ALLOWED_HANDLER); + uri.addQueryParameter("version", std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION)); uri.addQueryParameter("connection_string", getConnectionString()); ReadWriteBufferFromHTTP buf(uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(getContext()), credentials); @@ -204,6 +208,7 @@ protected: auto uri = createBaseURI(); uri.setPath(IDENTIFIER_QUOTE_HANDLER); + uri.addQueryParameter("version", std::to_string(XDBC_BRIDGE_PROTOCOL_VERSION)); uri.addQueryParameter("connection_string", getConnectionString()); ReadWriteBufferFromHTTP buf(uri, Poco::Net::HTTPRequest::HTTP_POST, {}, ConnectionTimeouts::getHTTPTimeouts(getContext()), credentials); diff --git a/src/Common/BridgeProtocolVersion.h b/src/Common/BridgeProtocolVersion.h new file mode 100644 index 00000000000..77d094fce3f --- /dev/null +++ b/src/Common/BridgeProtocolVersion.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace DB +{ + +// Version of protocol between clickhouse-server and clickhouse-library-bridge. Increment if you change it in a non-compatible way. +static constexpr size_t LIBRARY_BRIDGE_PROTOCOL_VERSION = 1; + +// Version of protocol between clickhouse-server and clickhouse-xdbc-bridge. Increment if you change it in a non-compatible way.
+static constexpr size_t XDBC_BRIDGE_PROTOCOL_VERSION = 1; + +} From eb61db3b6783380dbdecca3622bd41034acc6f97 Mon Sep 17 00:00:00 2001 From: Tanya Bragin Date: Mon, 8 Aug 2022 19:32:34 -0700 Subject: [PATCH 077/164] initial changes to close #37492 --- .../functions/type-conversion-functions.md | 14 +++++++++- .../functions/type-conversion-functions.md | 16 ++++++++++-- src/Functions/FunctionsConversion.cpp | 3 +++ src/Functions/FunctionsConversion.h | 26 ++++++++++++++++--- src/IO/parseDateTimeBestEffort.cpp | 5 ++++ src/IO/parseDateTimeBestEffort.h | 5 ++-- ...arse_date_time_64_best_effort_us.reference | 12 +++++++++ ...2381_parse_date_time_64_best_effort_us.sql | 22 ++++++++++++++++ 8 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference create mode 100644 tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index c51445675f5..51bec17ab94 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1458,7 +1458,19 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that ## parseDateTime64BestEffortOrZero -Same as for [parseDateTime64BestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. + +## parseDateTime64BestEffortUS + +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. + +## parseDateTime64BestEffortUSOrNull + +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity and returns `NULL` when it encounters a date format that cannot be processed. + +## parseDateTime64BestEffortUSOrZero + +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity and returns zero date or zero date time when it encounters a date format that cannot be processed. ## toLowCardinality diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 946abddf3d0..efb9c5c623a 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1374,13 +1374,25 @@ FORMAT PrettyCompactMonoBlock; └────────────────────────────┴────────────────────────────────┘ ``` -## parseDateTime64BestEffortOrNull {#parsedatetime32besteffortornull} +## parseDateTime64BestEffortOrNull {#parsedatetime64besteffortornull} Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает `NULL`, если формат даты не может быть обработан. ## parseDateTime64BestEffortOrZero {#parsedatetime64besteffortorzero} -Работает аналогично функции [parseDateTime64BestEffort](#parsedatetimebesteffort), но возвращает нулевую дату и время, если формат даты не может быть обработан. 
+Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает нулевую дату и время, если формат даты не может быть обработан. + +## parseDateTime64BestEffortUS {#parsedatetime64besteffortus} + +Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности. + +## parseDateTime64BestEffortUSOrNull {#parsedatetime64besteffortusornull} + +Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности и возвращает `NULL`, если формат даты не может быть обработан. + +## parseDateTime64BestEffortUSOrZero {#parsedatetime64besteffortusorzero} + +Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности и возвращает нулевую дату и время, если формат даты не может быть обработан. ## toLowCardinality {#tolowcardinality} diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index ae6ad0a6034..dc0235f810f 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -103,6 +103,9 @@ REGISTER_FUNCTION(Conversion) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 014ce98a795..1387c7af2b8 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1336,9 +1336,18 @@ struct ConvertThroughParsing } else if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffortUS) { - time_t res; - parsed = tryParseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); - convertFromTime(vec_to[i],res); + if constexpr (to_datetime64) + { + DateTime64 res = 0; + parsed = tryParseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); + vec_to[i] = res; + } + else + { + time_t res; + parsed = tryParseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); + convertFromTime(vec_to[i],res); + } } else { @@ -2525,6 +2534,9 @@ struct NameParseDateTime32BestEffortOrNull { static constexpr auto name = "parse struct NameParseDateTime64BestEffort { static constexpr auto name = "parseDateTime64BestEffort"; }; struct NameParseDateTime64BestEffortOrZero { static constexpr auto name = "parseDateTime64BestEffortOrZero"; }; struct NameParseDateTime64BestEffortOrNull { static constexpr auto name = "parseDateTime64BestEffortOrNull"; }; +struct NameParseDateTime64BestEffortUS { static constexpr auto name = "parseDateTime64BestEffortUS"; }; +struct NameParseDateTime64BestEffortUSOrZero { static constexpr auto name = "parseDateTime64BestEffortUSOrZero"; }; +struct NameParseDateTime64BestEffortUSOrNull { static constexpr auto name = "parseDateTime64BestEffortUSOrNull"; }; using FunctionParseDateTimeBestEffort = FunctionConvertFromString< @@ -2555,6 +2567,14 @@ using FunctionParseDateTime64BestEffortOrZero = FunctionConvertFromString< using FunctionParseDateTime64BestEffortOrNull = FunctionConvertFromString< DataTypeDateTime64,
NameParseDateTime64BestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTime64BestEffortUS = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUS, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTime64BestEffortUSOrZero = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUSOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTime64BestEffortUSOrNull = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUSOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffortUS>; + + class ExecutableFunctionCast : public IExecutableFunction { public: diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 3c6f9b8f9f5..df3f02708a9 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -697,4 +697,9 @@ bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & i return parseDateTime64BestEffortImpl(res, scale, in, local_time_zone, utc_time_zone); } +bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone) +{ + return parseDateTime64BestEffortImpl(res, scale, in, local_time_zone, utc_time_zone); +} + } diff --git a/src/IO/parseDateTimeBestEffort.h b/src/IO/parseDateTimeBestEffort.h index fe3da24a797..22af44f9e76 100644 --- a/src/IO/parseDateTimeBestEffort.h +++ b/src/IO/parseDateTimeBestEffort.h @@ -27,7 +27,6 @@ class ReadBuffer; * * DD/MM/YY * DD/MM/YYYY - when '/' separator is used, these are the only possible forms - * Note that American style is not supported. 
* * hh:mm:ss - when ':' separator is used, it is always time * hh:mm - it can be specified without seconds @@ -61,7 +60,7 @@ bool tryParseDateTimeBestEffort(time_t & res, ReadBuffer & in, const DateLUTImpl void parseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); bool tryParseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); void parseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); -void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); - +void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); +bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone); } diff --git a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference new file mode 100644 index 00000000000..be9aaafc5db --- /dev/null +++ b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference @@ -0,0 +1,12 @@ +parseDateTime64BestEffortUS + s a + + 01-02-1930 12:00:00 1930-01-02 12:00:00.000 + 12.02.1930 12:00:00 1930-12-02 12:00:00.000 + 13/02/1930 12:00:00 1930-02-13 12:00:00.000 + 02/25/1930 12:00:00 1930-02-25 12:00:00.000 + +parseDateTime64BestEffortUSOrNull +\N +parseDateTime64BestEffortUSOrZero +1969-12-31 16:00:00.000 diff --git a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql new file mode 100644 index 00000000000..20c0e7ef570 --- /dev/null +++ b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql @@ -0,0 +1,22 @@ +SELECT 'parseDateTime64BestEffortUS'; + +SELECT + s, + parseDateTime64BestEffortUS(s) AS a +FROM +( + SELECT arrayJoin([ +'01-02-1930 12:00:00', +'12.02.1930 12:00:00', +'13/02/1930 12:00:00', +'02/25/1930 12:00:00' +]) AS s) +FORMAT PrettySpaceNoEscapes; + +SELECT ''; + +SELECT 'parseDateTime64BestEffortUSOrNull'; +SELECT parseDateTime64BestEffortUSOrNull('01/45/1925 16:00:00'); + +SELECT 'parseDateTime64BestEffortUSOrZero'; +SELECT parseDateTime64BestEffortUSOrZero('01/45/1925 16:00:00'); From 78cc2a970d055530664445a1bd9876483ac53a1b Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Tue, 9 Aug 2022 02:40:17 -0400 Subject: [PATCH 078/164] copy self-extracting to output --- docker/packager/binary/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index fb2b08a2633..c2de0e33d82 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -104,6 +104,7 @@ if [ -n "$MAKE_DEB" ]; then fi mv ./programs/clickhouse* /output +[ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds find . -name '*.so' -print -exec mv '{}' /output \; find . 
-name '*.so.*' -print -exec mv '{}' /output \; From 96598e35743aa83e3ac31cc4776190543671f1d1 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 9 Aug 2022 08:52:02 +0000 Subject: [PATCH 079/164] Replace normalizeDayNum() by std::clamp() --- src/Common/DateLUTImpl.h | 9 --------- src/Functions/FunctionDateOrDateTimeAddInterval.h | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index fbbd29d7a2b..f9361e809a9 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -340,15 +340,6 @@ public: return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; } - static UInt16 normalizeDayNum(Int64 d) - { - if (d < 0) - return 0; - if (d > 65535) - return 65535; - return static_cast(d); - } - /// Round down to start of monday. template inline Time toFirstDayOfWeek(DateOrTime v) const diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 341a7cf504d..bf2f530cb7f 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -288,7 +288,7 @@ struct AddDaysImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - return DateLUT::instance().normalizeDayNum(d + delta); + return static_cast(std::clamp(d + delta, 0L, 65535L)); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) @@ -322,7 +322,7 @@ struct AddWeeksImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - return DateLUT::instance().normalizeDayNum(d + delta * 7); + return static_cast(std::clamp(d + delta * 7, 0, 65535)); } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) From 901e38091a2c2b7e4fe8b46e8594c12ff19b2abb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 9 Aug 2022 10:58:29 +0200 Subject: [PATCH 080/164] Add no-backward-compatibility-check for the crasher --- tests/queries/0_stateless/02293_hashid.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02293_hashid.sql b/tests/queries/0_stateless/02293_hashid.sql index 173ec2789c2..9938154f169 100644 --- a/tests/queries/0_stateless/02293_hashid.sql +++ b/tests/queries/0_stateless/02293_hashid.sql @@ -1,3 +1,4 @@ +-- Tags: no-backward-compatibility-check SET allow_experimental_hash_functions = 1; select number, hashid(number) from system.numbers limit 5; From 4c102adaf342ba80c7b335b69e988e66a30cf792 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 Aug 2022 11:39:40 +0000 Subject: [PATCH 081/164] Fix deadlock with msan --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index 611e4a63107..b41ff445294 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 611e4a631072d822074f6ea119a2b8d20c8760ca +Subproject commit b41ff4452944d50a44ad9c6e4621b50f44e9742e From f7012e378233838f49fa3ba35ae5c7fb574492e6 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 9 Aug 2022 14:28:37 +0200 Subject: [PATCH 082/164] Update 02354_distributed_with_external_aggregation_memory_usage.sql --- ...02354_distributed_with_external_aggregation_memory_usage.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02354_distributed_with_external_aggregation_memory_usage.sql 
b/tests/queries/0_stateless/02354_distributed_with_external_aggregation_memory_usage.sql index ffd2e14fe50..548660e36b1 100644 --- a/tests/queries/0_stateless/02354_distributed_with_external_aggregation_memory_usage.sql +++ b/tests/queries/0_stateless/02354_distributed_with_external_aggregation_memory_usage.sql @@ -1,4 +1,4 @@ --- Tags: long, no-tsan, no-msan, no-asan, no-ubsan, no-debug +-- Tags: long, no-tsan, no-msan, no-asan, no-ubsan, no-debug, no-s3-storage create table t_2354_dist_with_external_aggr(a UInt64, b String, c FixedString(100)) engine = MergeTree order by tuple(); From 7c85b62e4ed0e7a701d78a88024e12120699643d Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 8 Aug 2022 09:10:01 +0000 Subject: [PATCH 083/164] Add setting type number with 'auto' --- src/Core/SettingsFields.cpp | 72 ++++++++++++++++++++++++++++++++----- src/Core/SettingsFields.h | 42 ++++++++++++++++++++-- 2 files changed, 103 insertions(+), 11 deletions(-) diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index d77a510d7f9..e3bf44d33a7 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -156,29 +156,85 @@ template struct SettingFieldNumber; namespace { - UInt64 stringToMaxThreads(const String & str) + bool isAutoSetting(const String & str) + { + return startsWith(str, "auto"); + } + + bool isAutoSetting(const Field & f) + { + return f.getType() == Field::Types::String && isAutoSetting(f.get()); + } + + template + T stringToNumberWithAuto(const String & str) { if (startsWith(str, "auto")) return 0; - return parseFromString(str); + return parseFromString(str); } - UInt64 fieldToMaxThreads(const Field & f) + template + T fieldToNumberWithAuto(const Field & f) { if (f.getType() == Field::Types::String) - return stringToMaxThreads(f.get()); + return stringToNumberWithAuto(f.get()); else - return applyVisitor(FieldVisitorConvertToNumber(), f); + return applyVisitor(FieldVisitorConvertToNumber(), f); } } -SettingFieldMaxThreads::SettingFieldMaxThreads(const Field & f) : SettingFieldMaxThreads(fieldToMaxThreads(f)) +template +SettingFieldNumberWithAuto::SettingFieldNumberWithAuto(const Field & f) + : is_auto(isAutoSetting(f)), value(fieldToNumberWithAuto(f)) +{ +} + +template +SettingFieldNumberWithAuto & SettingFieldNumberWithAuto::operator=(const Field & f) +{ + is_auto = isAutoSetting(f); + value = fieldToNumberWithAuto(f); + return *this; +} + +template +String SettingFieldNumberWithAuto::toString() const +{ + if (is_auto) + return "auto"; + else + return ::DB::toString(value); +} + +template +void SettingFieldNumberWithAuto::parseFromString(const String & str) +{ + is_auto = isAutoSetting(str); + value = stringToNumberWithAuto(str); +} + +template +void SettingFieldNumberWithAuto::writeBinary(WriteBuffer & out) const +{ + writeStringBinary(toString(), out); +} + +template +void SettingFieldNumberWithAuto::readBinary(ReadBuffer & in) +{ + String str; + readStringBinary(str, in); + parseFromString(str); +} + +SettingFieldMaxThreads::SettingFieldMaxThreads(const Field & f) : SettingFieldMaxThreads(fieldToNumberWithAuto(f)) { } SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f) { - *this = fieldToMaxThreads(f); + *this = fieldToNumberWithAuto(f); return *this; } @@ -192,7 +248,7 @@ String SettingFieldMaxThreads::toString() const void SettingFieldMaxThreads::parseFromString(const String & str) { - *this = stringToMaxThreads(str); + *this = stringToNumberWithAuto(str); } void SettingFieldMaxThreads::writeBinary(WriteBuffer & out) const diff --git 
a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index 20f2b34084e..f7946368fc5 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -59,10 +59,46 @@ using SettingFieldFloat = SettingFieldNumber; using SettingFieldBool = SettingFieldNumber; -/** Unlike SettingFieldUInt64, supports the value of 'auto' - the number of processor cores without taking into account SMT. - * A value of 0 is also treated as auto. - * When serializing, `auto` is written in the same way as 0. +/** Like SettingFieldNumber, but also supports the value of 'auto'. + * Note: 0 and 'auto' are not equal. By default when 'auto' is set the value is set to 0. + * But if you need to distinguish between 0 and 'auto', you can use the 'is_auto' flag. + * Serialized as string. */ +template +struct SettingFieldNumberWithAuto +{ + using Type = T; + + bool is_auto; + T value; + bool changed = false; + + explicit SettingFieldNumberWithAuto() : is_auto(true), value(0) {} + explicit SettingFieldNumberWithAuto(T x) : is_auto(false), value(x) {} + explicit SettingFieldNumberWithAuto(const Field & f); + + SettingFieldNumberWithAuto & operator=(T x) { is_auto = false; value = x; changed = true; return *this; } + SettingFieldNumberWithAuto & operator=(const Field & f); + + operator T() const { return value; } /// NOLINT + explicit operator Field() const { return is_auto ? Field("auto") : Field(value); } + + String toString() const; + void parseFromString(const String & str); + + void writeBinary(WriteBuffer & out) const; + void readBinary(ReadBuffer & in); + + T valueOr(T default_value) const { return is_auto ? default_value : value; } +}; + +using SettingFieldUInt64WithAuto = SettingFieldNumberWithAuto; +using SettingFieldInt64WithAuto = SettingFieldNumberWithAuto; + +/* Similar to SettingFieldNumberWithAuto with small differences to behave like regular UInt64, supported to compatibility. + * When setting to 'auto' it becames equal to the number of processor cores without taking into account SMT. + * A value of 0 is also treated as 'auto', so 'auto' is parsed and serialized in the same way as 0. 
+ */ struct SettingFieldMaxThreads { bool is_auto; From 09e6b1a4976d3f7bb27ad9e85d0c046168e47ba9 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Tue, 9 Aug 2022 09:16:03 -0400 Subject: [PATCH 084/164] run clickhouse to decompress --- docker/test/fuzzer/run-fuzzer.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index f74760e3339..392d8110576 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -69,6 +69,8 @@ function download wget_with_retry "$BINARY_URL_TO_DOWNLOAD" chmod +x clickhouse + # clickhouse may be compressed - run once to decompress + ./clickhouse ||: ln -s ./clickhouse ./clickhouse-server ln -s ./clickhouse ./clickhouse-client From 3189c40e6d80478a0a91431f7dba37bc85c20427 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Aug 2022 16:15:28 +0000 Subject: [PATCH 085/164] Implement SettingField with auto is a wrapper --- src/Core/SettingsFields.cpp | 75 +++++----------------------------- src/Core/SettingsFields.h | 81 +++++++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 90 deletions(-) diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index e3bf44d33a7..86b20da9e8c 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -153,88 +153,35 @@ template struct SettingFieldNumber; template struct SettingFieldNumber; template struct SettingFieldNumber; +template struct SettingWithAuto>; +template struct SettingWithAuto>; +template struct SettingWithAuto>; namespace { - bool isAutoSetting(const String & str) - { - return startsWith(str, "auto"); - } - - bool isAutoSetting(const Field & f) - { - return f.getType() == Field::Types::String && isAutoSetting(f.get()); - } - - template - T stringToNumberWithAuto(const String & str) + UInt64 stringToMaxThreads(const String & str) { if (startsWith(str, "auto")) return 0; - return parseFromString(str); + return parseFromString(str); } - template - T fieldToNumberWithAuto(const Field & f) + UInt64 fieldToMaxThreads(const Field & f) { if (f.getType() == Field::Types::String) - return stringToNumberWithAuto(f.get()); + return stringToMaxThreads(f.get()); else - return applyVisitor(FieldVisitorConvertToNumber(), f); + return applyVisitor(FieldVisitorConvertToNumber(), f); } } -template -SettingFieldNumberWithAuto::SettingFieldNumberWithAuto(const Field & f) - : is_auto(isAutoSetting(f)), value(fieldToNumberWithAuto(f)) -{ -} - -template -SettingFieldNumberWithAuto & SettingFieldNumberWithAuto::operator=(const Field & f) -{ - is_auto = isAutoSetting(f); - value = fieldToNumberWithAuto(f); - return *this; -} - -template -String SettingFieldNumberWithAuto::toString() const -{ - if (is_auto) - return "auto"; - else - return ::DB::toString(value); -} - -template -void SettingFieldNumberWithAuto::parseFromString(const String & str) -{ - is_auto = isAutoSetting(str); - value = stringToNumberWithAuto(str); -} - -template -void SettingFieldNumberWithAuto::writeBinary(WriteBuffer & out) const -{ - writeStringBinary(toString(), out); -} - -template -void SettingFieldNumberWithAuto::readBinary(ReadBuffer & in) -{ - String str; - readStringBinary(str, in); - parseFromString(str); -} - -SettingFieldMaxThreads::SettingFieldMaxThreads(const Field & f) : SettingFieldMaxThreads(fieldToNumberWithAuto(f)) +SettingFieldMaxThreads::SettingFieldMaxThreads(const Field & f) : SettingFieldMaxThreads(fieldToMaxThreads(f)) { } SettingFieldMaxThreads & 
SettingFieldMaxThreads::operator=(const Field & f) { - *this = fieldToNumberWithAuto(f); + *this = fieldToMaxThreads(f); return *this; } @@ -248,7 +195,7 @@ String SettingFieldMaxThreads::toString() const void SettingFieldMaxThreads::parseFromString(const String & str) { - *this = stringToNumberWithAuto(str); + *this = stringToMaxThreads(str); } void SettingFieldMaxThreads::writeBinary(WriteBuffer & out) const diff --git a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index f7946368fc5..0fcc1d6783f 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -58,45 +58,74 @@ using SettingFieldInt64 = SettingFieldNumber; using SettingFieldFloat = SettingFieldNumber; using SettingFieldBool = SettingFieldNumber; - -/** Like SettingFieldNumber, but also supports the value of 'auto'. - * Note: 0 and 'auto' are not equal. By default when 'auto' is set the value is set to 0. - * But if you need to distinguish between 0 and 'auto', you can use the 'is_auto' flag. - * Serialized as string. +/** Wraps any SettingField to support special value 'auto' that can be checked with `is_auto` flag. + * Note about serialization: + * The new versions with `SettingsWriteFormat::STRINGS_WITH_FLAGS` serialize values as a string. + * In legacy SettingsWriteFormat mode, functions `read/writeBinary` would serialize values as a binary, and 'is_auto' would be ignored. + * It's possible to upgrade settings from regular type to wrapped ones and keep compatibility with old versions, + * but when serializing 'auto' old version will see binary representation of the default value. */ -template -struct SettingFieldNumberWithAuto +template +struct SettingWithAuto { - using Type = T; + constexpr static auto keyword = "auto"; + static bool isAuto(const Field & f) { return f.getType() == Field::Types::String && f.safeGet() == keyword; } + static bool isAuto(const String & str) { return str == keyword; } - bool is_auto; - T value; + using Type = typename Base::Type; + + Base base; + bool is_auto = false; bool changed = false; - explicit SettingFieldNumberWithAuto() : is_auto(true), value(0) {} - explicit SettingFieldNumberWithAuto(T x) : is_auto(false), value(x) {} - explicit SettingFieldNumberWithAuto(const Field & f); + explicit SettingWithAuto() : is_auto(true) {} + explicit SettingWithAuto(Type val) : is_auto(false) { base = Base(val); } - SettingFieldNumberWithAuto & operator=(T x) { is_auto = false; value = x; changed = true; return *this; } - SettingFieldNumberWithAuto & operator=(const Field & f); + explicit SettingWithAuto(const Field & f) + : is_auto(isAuto(f)) + { + if (!is_auto) + base = Base(f); + } - operator T() const { return value; } /// NOLINT - explicit operator Field() const { return is_auto ? Field("auto") : Field(value); } + SettingWithAuto & operator=(const Field & f) + { + changed = true; + if (is_auto = isAuto(f); !is_auto) + base = f; + return *this; + } - String toString() const; - void parseFromString(const String & str); + explicit operator Field() const { return is_auto ? Field(keyword) : Field(base); } - void writeBinary(WriteBuffer & out) const; - void readBinary(ReadBuffer & in); + String toString() const { return is_auto ? keyword : base.toString(); } - T valueOr(T default_value) const { return is_auto ? 
default_value : value; } + void parseFromString(const String & str) + { + changed = true; + if (is_auto = isAuto(str); !is_auto) + base.parseFromString(str); + } + + void writeBinary(WriteBuffer & out) const + { + if (is_auto) + Base().writeBinary(out); /// serialize default value + else + base.writeBinary(out); + } + + void readBinary(ReadBuffer & in) { changed = true; is_auto = false; base.readBinary(in); } + + Type valueOr(Type default_value) const { return is_auto ? default_value : base.value; } }; -using SettingFieldUInt64WithAuto = SettingFieldNumberWithAuto; -using SettingFieldInt64WithAuto = SettingFieldNumberWithAuto; +using SettingFieldUInt64WithAuto = SettingWithAuto; +using SettingFieldInt64WithAuto = SettingWithAuto; +using SettingFieldFloatWithAuto = SettingWithAuto; -/* Similar to SettingFieldNumberWithAuto with small differences to behave like regular UInt64, supported to compatibility. - * When setting to 'auto' it becames equal to the number of processor cores without taking into account SMT. +/* Similar to SettingFieldUInt64WithAuto with small differences to behave like regular UInt64, supported to compatibility. + * When setting to 'auto' it becomes equal to the number of processor cores without taking into account SMT. * A value of 0 is also treated as 'auto', so 'auto' is parsed and serialized in the same way as 0. */ struct SettingFieldMaxThreads From ae1db8386bf28563f993d3927411968f7f4f9455 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Aug 2022 16:16:42 +0000 Subject: [PATCH 086/164] Change type of insert_quorum to UInt64WithAuto --- src/Core/Settings.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 92af0576129..727e45e3e50 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -212,7 +212,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \ \ - M(UInt64, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \ + M(UInt64WithAuto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \ M(Milliseconds, insert_quorum_timeout, 600000, "", 0) \ M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \ M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \ diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b4fc6b34c9e..3e5c1ce6982 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4443,8 +4443,9 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate; // TODO: should we also somehow pass list of columns to deduplicate on to the ReplicatedMergeTreeSink? 
+ // TODO: insert_quorum = 'auto' would be supported in https://github.com/ClickHouse/ClickHouse/pull/39970, now it's same as 0. return std::make_shared( - *this, metadata_snapshot, query_settings.insert_quorum, + *this, metadata_snapshot, query_settings.insert_quorum.valueOr(0), query_settings.insert_quorum_timeout.totalMilliseconds(), query_settings.max_partitions_per_insert_block, query_settings.insert_quorum_parallel, From 21ace403ab98f4fc3b221de2376d09deb9c1ffde Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Aug 2022 17:04:15 +0000 Subject: [PATCH 087/164] Add test setting_value_auto --- .../0_stateless/02381_setting_value_auto.reference | 4 ++++ tests/queries/0_stateless/02381_setting_value_auto.sql | 10 ++++++++++ 2 files changed, 14 insertions(+) create mode 100644 tests/queries/0_stateless/02381_setting_value_auto.reference create mode 100644 tests/queries/0_stateless/02381_setting_value_auto.sql diff --git a/tests/queries/0_stateless/02381_setting_value_auto.reference b/tests/queries/0_stateless/02381_setting_value_auto.reference new file mode 100644 index 00000000000..72c87cf6f7d --- /dev/null +++ b/tests/queries/0_stateless/02381_setting_value_auto.reference @@ -0,0 +1,4 @@ +0 0 UInt64WithAuto +auto 1 UInt64WithAuto +0 1 UInt64WithAuto +1 1 UInt64WithAuto diff --git a/tests/queries/0_stateless/02381_setting_value_auto.sql b/tests/queries/0_stateless/02381_setting_value_auto.sql new file mode 100644 index 00000000000..5b536a9d749 --- /dev/null +++ b/tests/queries/0_stateless/02381_setting_value_auto.sql @@ -0,0 +1,10 @@ +SELECT value, changed, type FROM system.settings WHERE name = 'insert_quorum'; + +SET insert_quorum = 'auto'; +SELECT value, changed, type FROM system.settings WHERE name = 'insert_quorum'; + +SET insert_quorum = 0; +SELECT value, changed, type FROM system.settings WHERE name = 'insert_quorum'; + +SET insert_quorum = 1; +SELECT value, changed, type FROM system.settings WHERE name = 'insert_quorum'; From 7c364d4124e912bc1b2be90905b63a78ca8e0c69 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 9 Aug 2022 17:24:37 +0000 Subject: [PATCH 088/164] fix parsing of tuples in case of errors --- src/Columns/ColumnArray.cpp | 4 +- .../Serializations/SerializationTuple.cpp | 38 +++++++++---------- .../02381_parse_array_of_tuples.reference | 3 ++ .../02381_parse_array_of_tuples.sql | 14 +++++++ 4 files changed, 39 insertions(+), 20 deletions(-) create mode 100644 tests/queries/0_stateless/02381_parse_array_of_tuples.reference create mode 100644 tests/queries/0_stateless/02381_parse_array_of_tuples.sql diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 24da9644335..bf646139b41 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -56,7 +56,9 @@ ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && /// This will also prevent possible overflow in offset. if (data->size() != last_offset) - throw Exception("offsets_column has data inconsistent with nested_column", ErrorCodes::LOGICAL_ERROR); + throw Exception( ErrorCodes::LOGICAL_ERROR, + "offsets_column has data inconsistent with nested_column. 
Data size: {}, last offset: {}", + data->size(), last_offset); } /** NOTE diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 8dc15fc9841..8138b15c9af 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -135,20 +135,21 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co } elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); } - }); - // Special format for one element tuple (1,) - if (1 == elems.size()) - { + // Special format for one element tuple (1,) + if (1 == elems.size()) + { + skipWhitespaceIfAny(istr); + // Allow both (1) and (1,) + checkChar(',', istr); + } + skipWhitespaceIfAny(istr); - // Allow both (1) and (1,) - checkChar(',', istr); - } - skipWhitespaceIfAny(istr); - assertChar(')', istr); + assertChar(')', istr); - if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + }); } void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -213,19 +214,18 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr auto & element_column = extractElementColumn(column, element_pos); elems[element_pos]->deserializeTextJSON(element_column, istr, settings); } - }); - skipWhitespaceIfAny(istr); - assertChar('}', istr); + skipWhitespaceIfAny(istr); + assertChar('}', istr); + }); } else { - const size_t size = elems.size(); assertChar('[', istr); addElementSafe(elems.size(), column, [&] { - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < elems.size(); ++i) { skipWhitespaceIfAny(istr); if (i != 0) @@ -235,10 +235,10 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr } elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); } - }); - skipWhitespaceIfAny(istr); - assertChar(']', istr); + skipWhitespaceIfAny(istr); + assertChar(']', istr); + }); } } diff --git a/tests/queries/0_stateless/02381_parse_array_of_tuples.reference b/tests/queries/0_stateless/02381_parse_array_of_tuples.reference new file mode 100644 index 00000000000..fa7137d800d --- /dev/null +++ b/tests/queries/0_stateless/02381_parse_array_of_tuples.reference @@ -0,0 +1,3 @@ +1 [[]] +2 [[(500,246)]] +3 [[(500,10)]] diff --git a/tests/queries/0_stateless/02381_parse_array_of_tuples.sql b/tests/queries/0_stateless/02381_parse_array_of_tuples.sql new file mode 100644 index 00000000000..51db5a0fe9b --- /dev/null +++ b/tests/queries/0_stateless/02381_parse_array_of_tuples.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS t_parse_tuples; + +CREATE TABLE t_parse_tuples +( + id UInt32, + arr Array(Array(Tuple(c1 Int32, c2 UInt8))) +) +ENGINE = Memory; + +INSERT INTO t_parse_tuples VALUES (1, [[]]), (2, [[(500, -10)]]), (3, [[(500, '10')]]); + +SELECT * FROM t_parse_tuples ORDER BY id; + +DROP TABLE IF EXISTS t_parse_tuples; From c6ec8a4bcf1835072dfe8908411a334260313097 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Aug 2022 23:45:21 +0200 Subject: [PATCH 089/164] tests: attempt to make 02293_part_log_has_merge_reason less flaky CI: https://s3.amazonaws.com/clickhouse-test-reports/40003/acc33d4a116802eedc4e089f7e1612efe8e62d4d/stateless_tests__release__wide_parts_enabled_.html Signed-off-by: Azat Khuzhin --- 
.../queries/0_stateless/02293_part_log_has_merge_reason.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql index 002bc1f37dd..9d2575314d4 100644 --- a/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql +++ b/tests/queries/0_stateless/02293_part_log_has_merge_reason.sql @@ -6,10 +6,10 @@ CREATE TABLE t_part_log_has_merge_type_table UserID UInt64, Comment String ) -ENGINE = MergeTree() +ENGINE = MergeTree() ORDER BY tuple() TTL event_time + INTERVAL 3 MONTH -SETTINGS min_bytes_for_wide_part = 0, materialize_ttl_recalculate_only = true; +SETTINGS min_bytes_for_wide_part = 0, materialize_ttl_recalculate_only = true, max_number_of_merges_with_ttl_in_pool = 100; INSERT INTO t_part_log_has_merge_type_table VALUES (now(), 1, 'username1'); INSERT INTO t_part_log_has_merge_type_table VALUES (now() - INTERVAL 4 MONTH, 2, 'username2'); @@ -20,7 +20,7 @@ SYSTEM FLUSH LOGS; SELECT event_type, - merge_reason + merge_reason FROM system.part_log WHERE From be51187f85b08b89d789e61f42052d236133d783 Mon Sep 17 00:00:00 2001 From: Tanya Bragin Date: Tue, 9 Aug 2022 17:27:06 -0700 Subject: [PATCH 090/164] Update tests to use UTC, and docs to consistent order --- .../functions/type-conversion-functions.md | 8 +++---- .../functions/type-conversion-functions.md | 8 +++---- ...2381_parseDateTime64BestEffortUS.reference | 12 ++++++++++ .../02381_parseDateTime64BestEffortUS.sql | 22 +++++++++++++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.reference create mode 100644 tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.sql diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 51bec17ab94..9514760fd7d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1452,6 +1452,10 @@ Result: └────────────────────────────┴────────────────────────────────┘ ``` +## parseDateTime64BestEffortUS + +Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. + ## parseDateTime64BestEffortOrNull Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that it returns `NULL` when it encounters a date format that cannot be processed. @@ -1460,10 +1464,6 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. -## parseDateTime64BestEffortUS - -Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. - ## parseDateTime64BestEffortUSOrNull Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity and returns `NULL` when it encounters a date format that cannot be processed. 
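A short usage sketch of the US-flavoured parsers documented above; the expected results are copied from the 02381_parseDateTime64BestEffortUS test added by this commit (scale 3 and the UTC timezone are passed explicitly, as in that test):

```sql
SELECT parseDateTime64BestEffortUS('01-02-1930 12:00:00', 3, 'UTC');
-- 1930-01-02 12:00:00.000   (the ambiguous 01-02 is read as MM-DD, i.e. January 2)

SELECT parseDateTime64BestEffortUSOrNull('01/45/1925 16:00:00', 3, 'UTC');
-- NULL   (45 is not a valid day, so parsing fails)

SELECT parseDateTime64BestEffortUSOrZero('01/45/1925 16:00:00', 3, 'UTC');
-- 1970-01-01 00:00:00.000   (zero DateTime64 instead of NULL)
```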
diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index efb9c5c623a..56e63b793f3 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1374,6 +1374,10 @@ FORMAT PrettyCompactMonoBlock; └────────────────────────────┴────────────────────────────────┘ ``` +## parseDateTime64BestEffortUS {#parsedatetime64besteffortus} + +Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что в она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности. + ## parseDateTime64BestEffortOrNull {#parsedatetime64besteffortornull} Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает `NULL`, если формат даты не может быть обработан. @@ -1382,10 +1386,6 @@ FORMAT PrettyCompactMonoBlock; Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но возвращает нулевую дату и время, если формат даты не может быть обработан. -## parseDateTime64BestEffortUS {#parsedatetime64besteffortus} - -Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что в она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности. - ## parseDateTime64BestEffortUSOrNull {#parsedatetime64besteffortusornull} Работает аналогично функции [parseDateTime64BestEffort](#parsedatetime64besteffort), но разница состоит в том, что в она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности и возвращает `NULL`, если формат даты не может быть обработан. diff --git a/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.reference b/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.reference new file mode 100644 index 00000000000..c9a13c97baf --- /dev/null +++ b/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.reference @@ -0,0 +1,12 @@ +parseDateTime64BestEffortUS + s a + + 01-02-1930 12:00:00 1930-01-02 12:00:00.000 + 12.02.1930 12:00:00 1930-12-02 12:00:00.000 + 13/02/1930 12:00:00 1930-02-13 12:00:00.000 + 02/25/1930 12:00:00 1930-02-25 12:00:00.000 + +parseDateTime64BestEffortUSOrNull +\N +parseDateTime64BestEffortUSOrZero +1970-01-01 00:00:00.000 diff --git a/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.sql b/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.sql new file mode 100644 index 00000000000..21dc7b1a990 --- /dev/null +++ b/tests/queries/0_stateless/02381_parseDateTime64BestEffortUS.sql @@ -0,0 +1,22 @@ +SELECT 'parseDateTime64BestEffortUS'; + +SELECT + s, + parseDateTime64BestEffortUS(s,3,'UTC') AS a +FROM +( + SELECT arrayJoin([ +'01-02-1930 12:00:00', +'12.02.1930 12:00:00', +'13/02/1930 12:00:00', +'02/25/1930 12:00:00' +]) AS s) +FORMAT PrettySpaceNoEscapes; + +SELECT ''; + +SELECT 'parseDateTime64BestEffortUSOrNull'; +SELECT parseDateTime64BestEffortUSOrNull('01/45/1925 16:00:00',3,'UTC'); + +SELECT 'parseDateTime64BestEffortUSOrZero'; +SELECT parseDateTime64BestEffortUSOrZero('01/45/1925 16:00:00',3,'UTC'); From 972fd418cb14208007124ac5d666849177b6c2b0 Mon Sep 17 00:00:00 2001 From: Tanya Bragin Date: Tue, 9 Aug 2022 17:30:34 -0700 Subject: [PATCH 091/164] Delete old files --- ...arse_date_time_64_best_effort_us.reference | 12 ---------- ...2381_parse_date_time_64_best_effort_us.sql | 22 ------------------- 2 files changed, 34 deletions(-) delete 
mode 100644 tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference delete mode 100644 tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql diff --git a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference deleted file mode 100644 index be9aaafc5db..00000000000 --- a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.reference +++ /dev/null @@ -1,12 +0,0 @@ -parseDateTime64BestEffortUS - s a - - 01-02-1930 12:00:00 1930-01-02 12:00:00.000 - 12.02.1930 12:00:00 1930-12-02 12:00:00.000 - 13/02/1930 12:00:00 1930-02-13 12:00:00.000 - 02/25/1930 12:00:00 1930-02-25 12:00:00.000 - -parseDateTime64BestEffortUSOrNull -\N -parseDateTime64BestEffortUSOrZero -1969-12-31 16:00:00.000 diff --git a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql b/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql deleted file mode 100644 index 20c0e7ef570..00000000000 --- a/tests/queries/0_stateless/02381_parse_date_time_64_best_effort_us.sql +++ /dev/null @@ -1,22 +0,0 @@ -SELECT 'parseDateTime64BestEffortUS'; - -SELECT - s, - parseDateTime64BestEffortUS(s) AS a -FROM -( - SELECT arrayJoin([ -'01-02-1930 12:00:00', -'12.02.1930 12:00:00', -'13/02/1930 12:00:00', -'02/25/1930 12:00:00' -]) AS s) -FORMAT PrettySpaceNoEscapes; - -SELECT ''; - -SELECT 'parseDateTime64BestEffortUSOrNull'; -SELECT parseDateTime64BestEffortUSOrNull('01/45/1925 16:00:00'); - -SELECT 'parseDateTime64BestEffortUSOrZero'; -SELECT parseDateTime64BestEffortUSOrZero('01/45/1925 16:00:00'); From 810221baf21c8e055839035984aae5d687dda39a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 10 Aug 2022 07:39:32 +0000 Subject: [PATCH 092/164] Assume unversioned server has version=0 and use tryParse() instead of from_chars() --- .../library-bridge/LibraryBridgeHandlers.cpp | 25 ++++++++--------- programs/odbc-bridge/ColumnInfoHandler.cpp | 27 ++++++++---------- .../odbc-bridge/IdentifierQuoteHandler.cpp | 27 ++++++++---------- programs/odbc-bridge/MainHandler.cpp | 26 ++++++++--------- programs/odbc-bridge/SchemaAllowedHandler.cpp | 28 +++++++++---------- 5 files changed, 60 insertions(+), 73 deletions(-) diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index bdb5f3ca02b..a28148bd1f7 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace DB @@ -92,27 +91,25 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ LOG_TRACE(log, "Request URI: {}", request.getURI()); HTMLForm params(getContext()->getSettingsRef(), request); + size_t version; + if (!params.has("version")) - { - processError(response, "No 'version' in request URL"); - return; - } + version = 0; /// assumed version for too old servers which do not send a version else { String version_str = params.get("version"); - size_t version; - auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); - if (ec != std::errc()) + if (!tryParse(version, version_str)) { processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); return; } - if (version != LIBRARY_BRIDGE_PROTOCOL_VERSION) - { - // backwards compatibility is for now deemed unnecessary, just 
let the user upgrade the server and bridge to the same version - processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); - return; - } + } + + if (version != LIBRARY_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; } if (!params.has("method")) diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index 7d2f6f57d34..0ea2495af78 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -23,8 +24,6 @@ #include #include -#include - namespace DB { @@ -83,27 +82,25 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ LOG_WARNING(log, fmt::runtime(message)); }; + size_t version; + if (!params.has("version")) - { - process_error("No 'version' in request URL"); - return; - } + version = 0; /// assumed version for too old servers which do not send a version else { String version_str = params.get("version"); - size_t version; - auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); - if (ec != std::errc()) + if (!tryParse(version, version_str)) { process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); return; } - if (version != XDBC_BRIDGE_PROTOCOL_VERSION) - { - // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version - process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); - return; - } + } + + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. 
'" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; } if (!params.has("table")) diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index b162a2ea324..8157e3b6159 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -17,8 +18,6 @@ #include "validateODBCConnectionString.h" #include "ODBCPooledConnectionFactory.h" -#include - namespace DB { @@ -35,27 +34,25 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ LOG_WARNING(log, fmt::runtime(message)); }; + size_t version; + if (!params.has("version")) - { - process_error("No 'version' in request URL"); - return; - } + version = 0; /// assumed version for too old servers which do not send a version else { String version_str = params.get("version"); - size_t version; - auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); - if (ec != std::errc()) + if (!tryParse(version, version_str)) { process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); return; } - if (version != XDBC_BRIDGE_PROTOCOL_VERSION) - { - // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version - process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); - return; - } + } + + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; } if (!params.has("connection_string")) diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index 6ece12198d3..fe22d8facfd 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -57,29 +56,28 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse HTMLForm params(getContext()->getSettingsRef(), request); LOG_TRACE(log, "Request URI: {}", request.getURI()); + size_t version; + if (!params.has("version")) - { - processError(response, "No 'version' in request URL"); - return; - } + version = 0; /// assumed version for too old servers which do not send a version else { String version_str = params.get("version"); - size_t version; - auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); - if (ec != std::errc()) + if (!tryParse(version, version_str)) { processError(response, "Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); return; } - if (version != XDBC_BRIDGE_PROTOCOL_VERSION) - { - // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version - processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. 
'" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); - return; - } } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + processError(response, "Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + + if (mode == "read") params.read(request.getStream()); diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index f70474fc898..4d20c8bc3b7 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -14,8 +15,6 @@ #include #include -#include - namespace DB { @@ -43,29 +42,28 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer LOG_WARNING(log, fmt::runtime(message)); }; + size_t version; + if (!params.has("version")) - { - process_error("No 'version' in request URL"); - return; - } + version = 0; /// assumed version for too old servers which do not send a version else { String version_str = params.get("version"); - size_t version; - auto [_, ec] = std::from_chars(version_str.data(), version_str.data() + version_str.size(), version); - if (ec != std::errc()) + if (!tryParse(version, version_str)) { process_error("Unable to parse 'version' string in request URL: '" + version_str + "' Check if the server and library-bridge have the same version."); return; } - if (version != XDBC_BRIDGE_PROTOCOL_VERSION) - { - // backwards compatibility is for now deemed unnecessary, just let the user upgrade the server and bridge to the same version - process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. '" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); - return; - } } + if (version != XDBC_BRIDGE_PROTOCOL_VERSION) + { + /// backwards compatibility is considered unnecessary for now, just let the user know that the server and the bridge must be upgraded together + process_error("Server and library-bridge have different versions: '" + std::to_string(version) + "' vs. 
'" + std::to_string(LIBRARY_BRIDGE_PROTOCOL_VERSION) + "'"); + return; + } + + if (!params.has("connection_string")) { process_error("No 'connection_string' in request URL"); From 537ba613dcc5c2f23f5763f894dcbe2b9823d32d Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 10 Aug 2022 08:14:16 +0000 Subject: [PATCH 093/164] Remove clamping from AddDaysImpl and AddWeeksImpl; fix 01921_datatype_date32 test --- src/Functions/FunctionDateOrDateTimeAddInterval.h | 8 ++++---- tests/queries/0_stateless/01921_datatype_date32.reference | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index bf2f530cb7f..fbfc9e9bc1f 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -288,12 +288,12 @@ struct AddDaysImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - return static_cast(std::clamp(d + delta, 0L, 65535L)); + return d + delta; } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int64 delta, const DateLUTImpl &, UInt16 = 0) { - return std::max(static_cast(d + delta), -static_cast(DateLUT::instance().getDayNumOffsetEpoch())); + return d + delta; } }; @@ -322,12 +322,12 @@ struct AddWeeksImpl static inline NO_SANITIZE_UNDEFINED UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - return static_cast(std::clamp(d + delta * 7, 0, 65535)); + return d + delta * 7; } static inline NO_SANITIZE_UNDEFINED Int32 execute(Int32 d, Int32 delta, const DateLUTImpl &, UInt16 = 0) { - return std::max(static_cast(d + delta * 7), -static_cast(DateLUT::instance().getDayNumOffsetEpoch())); + return d + delta * 7; } }; diff --git a/tests/queries/0_stateless/01921_datatype_date32.reference b/tests/queries/0_stateless/01921_datatype_date32.reference index 8cc9cc2886f..acb0cc4ca59 100644 --- a/tests/queries/0_stateless/01921_datatype_date32.reference +++ b/tests/queries/0_stateless/01921_datatype_date32.reference @@ -248,14 +248,14 @@ 2299-12-30 23:00:00.000 2021-06-21 23:00:00.000 -------subtractDays--------- -1900-01-01 -1900-01-01 +2299-12-31 +2299-12-31 2299-12-08 2299-12-24 2021-06-15 -------subtractWeeks--------- -1900-01-01 -1900-01-01 +2299-12-31 +2299-12-31 2299-12-08 2299-12-24 2021-06-15 From 36ea40ab64430b5a93eeca8d6ed5ee56a6d66522 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 10 Aug 2022 11:21:42 +0200 Subject: [PATCH 094/164] Fix clickhouse-test hang in case of CREATE DATABASE fails Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 952fc7fb0a9..96a7c929260 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -914,7 +914,7 @@ class TestCase: description_full += result.description description_full += "\n" - if result.status == TestStatus.FAIL: + if result.status == TestStatus.FAIL and self.testcase_args: description_full += "Database: " + self.testcase_args.testcase_database result.description = description_full From 6164506a5ebb50bd451c4305bbc30a54085106da Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 10 Aug 2022 10:52:58 +0200 Subject: [PATCH 095/164] tests: fix 02380_insert_mv_race for Ordinary database Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/02380_insert_mv_race.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/tests/queries/0_stateless/02380_insert_mv_race.sh b/tests/queries/0_stateless/02380_insert_mv_race.sh index 3edf99fb502..725c7eacce6 100755 --- a/tests/queries/0_stateless/02380_insert_mv_race.sh +++ b/tests/queries/0_stateless/02380_insert_mv_race.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, race, no-ordinary-database +# Tags: long, race # Regression test for INSERT into table with MV attached, # to avoid possible errors if some table will disappears, @@ -9,6 +9,12 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh +$CLICKHOUSE_CLIENT -nm -q "ATTACH TABLE mv" |& { + # CANNOT_GET_CREATE_TABLE_QUERY -- ATTACH TABLE IF EXISTS + # TABLE_ALREADY_EXISTS -- ATTACH TABLE IF NOT EXISTS + grep -F -m1 Exception | grep -v -e CANNOT_GET_CREATE_TABLE_QUERY -e TABLE_ALREADY_EXISTS +} + $CLICKHOUSE_CLIENT -nm -q " DROP TABLE IF EXISTS null; CREATE TABLE null (key Int) ENGINE = Null; @@ -23,4 +29,8 @@ $CLICKHOUSE_CLIENT -q "INSERT INTO null SELECT * FROM numbers_mt(1000) settings } & sleep 0.05 $CLICKHOUSE_CLIENT -q "DETACH TABLE mv" + +# avoid leftovers on DROP DATABASE (force_remove_data_recursively_on_drop) for Ordinary database +$CLICKHOUSE_CLIENT -q "ATTACH TABLE mv" + wait From cdd5b32d297e1a953d1d31bfed2db78651bdb0b8 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 9 Aug 2022 16:46:23 +0800 Subject: [PATCH 096/164] fix HashMethodOneNumber with const column --- src/Common/ColumnsHashing.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index e921f4fbf9a..3c44c89674e 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -39,15 +39,18 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const char * vec; + ColumnPtr owned_column; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { - vec = key_columns[0]->getRawData().data; + owned_column = key_columns[0]->convertToFullColumnIfConst(); + vec = owned_column->getRawData().data; } explicit HashMethodOneNumber(const IColumn * column) { + owned_column = column->convertToFullColumnIfConst(); vec = column->getRawData().data; } From 1a08161dbfcf1fd3885c4eaff39dc7f533ec7fc9 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 9 Aug 2022 17:18:21 +0800 Subject: [PATCH 097/164] simplify solution --- src/Common/ColumnsHashing.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 3c44c89674e..b558aaccc7e 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -39,19 +39,19 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const char * vec; - ColumnPtr owned_column; + bool is_column_const; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. 
HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { - owned_column = key_columns[0]->convertToFullColumnIfConst(); - vec = owned_column->getRawData().data; + vec = key_columns[0]->getRawData().data; + is_column_const = isColumnConst(*key_columns[0]); } explicit HashMethodOneNumber(const IColumn * column) { - owned_column = column->convertToFullColumnIfConst(); vec = column->getRawData().data; + is_column_const = isColumnConst(*column); } /// Creates context. Method is called once and result context is used in all threads. @@ -69,7 +69,11 @@ struct HashMethodOneNumber using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t /// Is used for default implementation in HashMethodBase. - FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad(vec + row * sizeof(FieldType)); } + FieldType getKeyHolder(size_t row, Arena &) const + { + size_t pos = is_column_const ? 0 : row; + return unalignedLoad(vec + pos * sizeof(FieldType)); + } const FieldType * getKeyData() const { return reinterpret_cast(vec); } }; From 7910a93b02bb524e88bd68353e0547b6c3a26da3 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 9 Aug 2022 18:25:52 +0800 Subject: [PATCH 098/164] fix other hash methods and add tests --- src/Common/ColumnsHashing.h | 20 +++++++++++++------ .../02381_intersect_hash_method.reference | 9 +++++++++ .../02381_intersect_hash_method.sql | 3 +++ 3 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/02381_intersect_hash_method.reference create mode 100644 tests/queries/0_stateless/02381_intersect_hash_method.sql diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index b558aaccc7e..877e57db3d6 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -39,19 +39,19 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const char * vec; - bool is_column_const; + bool column_is_const = false; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { vec = key_columns[0]->getRawData().data; - is_column_const = isColumnConst(*key_columns[0]); + column_is_const = isColumnConst(*key_columns[0]); } explicit HashMethodOneNumber(const IColumn * column) { vec = column->getRawData().data; - is_column_const = isColumnConst(*column); + column_is_const = isColumnConst(*column); } /// Creates context. Method is called once and result context is used in all threads. @@ -71,7 +71,7 @@ struct HashMethodOneNumber /// Is used for default implementation in HashMethodBase. FieldType getKeyHolder(size_t row, Arena &) const { - size_t pos = is_column_const ? 0 : row; + size_t pos = column_is_const ? 
0 : row; return unalignedLoad(vec + pos * sizeof(FieldType)); } @@ -89,12 +89,16 @@ struct HashMethodString const IColumn::Offset * offsets; const UInt8 * chars; + bool column_is_const = false; HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { const IColumn * column = key_columns[0]; if (isColumnConst(*column)) + { + column_is_const = true; column = &assert_cast(*column).getDataColumn(); + } const ColumnString & column_string = assert_cast(*column); offsets = column_string.getOffsets().data(); @@ -103,7 +107,8 @@ struct HashMethodString auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const { - StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); + ssize_t pos = column_is_const ? 0 : row; + StringRef key(chars + offsets[pos - 1], offsets[pos] - offsets[pos - 1] - 1); if constexpr (place_string_to_arena) { @@ -131,6 +136,7 @@ struct HashMethodFixedString size_t n; const ColumnFixedString::Chars * chars; + bool column_is_const = false; HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { @@ -138,11 +144,13 @@ struct HashMethodFixedString const ColumnFixedString & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); + column_is_const = isColumnConst(column); } auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const { - StringRef key(&(*chars)[row * n], n); + size_t pos = column_is_const ? 0 : row; + StringRef key(&(*chars)[pos * n], n); if constexpr (place_string_to_arena) { diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.reference b/tests/queries/0_stateless/02381_intersect_hash_method.reference new file mode 100644 index 00000000000..bb0850568bb --- /dev/null +++ b/tests/queries/0_stateless/02381_intersect_hash_method.reference @@ -0,0 +1,9 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.sql b/tests/queries/0_stateless/02381_intersect_hash_method.sql new file mode 100644 index 00000000000..cc1967dafd7 --- /dev/null +++ b/tests/queries/0_stateless/02381_intersect_hash_method.sql @@ -0,0 +1,3 @@ +SELECT 1 FROM numbers(3) INTERSECT SELECT 1 FROM numbers(3); +SELECT toString(1) FROM numbers(3) INTERSECT SELECT toString(1) FROM numbers(3); +SELECT '1' FROM numbers(3) INTERSECT SELECT '1' FROM numbers(3); From 9e865d48ce6466287df427dac35a9dd262e88060 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 9 Aug 2022 18:50:17 +0800 Subject: [PATCH 099/164] increase test size --- .../02381_intersect_hash_method.reference | 21 +++++++++++++++++++ .../02381_intersect_hash_method.sql | 6 +++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.reference b/tests/queries/0_stateless/02381_intersect_hash_method.reference index bb0850568bb..ac8f48bbb7b 100644 --- a/tests/queries/0_stateless/02381_intersect_hash_method.reference +++ b/tests/queries/0_stateless/02381_intersect_hash_method.reference @@ -7,3 +7,24 @@ 1 1 1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.sql b/tests/queries/0_stateless/02381_intersect_hash_method.sql index cc1967dafd7..1154718c686 100644 --- a/tests/queries/0_stateless/02381_intersect_hash_method.sql +++ b/tests/queries/0_stateless/02381_intersect_hash_method.sql @@ -1,3 +1,3 @@ -SELECT 1 FROM numbers(3) INTERSECT SELECT 1 FROM numbers(3); -SELECT 
toString(1) FROM numbers(3) INTERSECT SELECT toString(1) FROM numbers(3); -SELECT '1' FROM numbers(3) INTERSECT SELECT '1' FROM numbers(3); +SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); +SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); +SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); From 17fa8076e32e0c0f0713bb2f4830875ad169f8f6 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 10 Aug 2022 18:56:18 +0800 Subject: [PATCH 100/164] better implementation --- src/Common/ColumnsHashing.h | 50 ++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 877e57db3d6..6aef60fffc7 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -7,6 +7,7 @@ #include #include #include "Columns/IColumn.h" +#include "Core/Field.h" #include #include @@ -15,8 +16,10 @@ #include #include +#include #include #include +#include namespace DB @@ -39,19 +42,27 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const char * vec; - bool column_is_const = false; + FieldType const_value; + std::function get_key_holder_impl; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { vec = key_columns[0]->getRawData().data; - column_is_const = isColumnConst(*key_columns[0]); + if (isColumnConst(*key_columns[0])) + { + const_value = unalignedLoad(vec); + get_key_holder_impl = [this](size_t /*row*/) { return const_value; }; + } + else + { + get_key_holder_impl = [this](size_t row) { return unalignedLoad(vec + row * sizeof(FieldType)); }; + } } explicit HashMethodOneNumber(const IColumn * column) { vec = column->getRawData().data; - column_is_const = isColumnConst(*column); } /// Creates context. Method is called once and result context is used in all threads. @@ -69,11 +80,7 @@ struct HashMethodOneNumber using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t /// Is used for default implementation in HashMethodBase. - FieldType getKeyHolder(size_t row, Arena &) const - { - size_t pos = column_is_const ? 0 : row; - return unalignedLoad(vec + pos * sizeof(FieldType)); - } + FieldType getKeyHolder(size_t row, Arena &) const { return get_key_holder_impl(row); } const FieldType * getKeyData() const { return reinterpret_cast(vec); } }; @@ -89,27 +96,28 @@ struct HashMethodString const IColumn::Offset * offsets; const UInt8 * chars; - bool column_is_const = false; + std::function get_key_holder_impl; HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { const IColumn * column = key_columns[0]; - if (isColumnConst(*column)) - { - column_is_const = true; + bool column_is_const = isColumnConst(*column); + if (column_is_const) column = &assert_cast(*column).getDataColumn(); - } const ColumnString & column_string = assert_cast(*column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); + + if (column_is_const) + get_key_holder_impl = [this](size_t /*row*/) { return StringRef(chars, offsets[0] - 1); }; + else + get_key_holder_impl = [this](size_t row) { return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); }; } auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const { - ssize_t pos = column_is_const ? 
0 : row; - StringRef key(chars + offsets[pos - 1], offsets[pos] - offsets[pos - 1] - 1); - + StringRef key = get_key_holder_impl(row); if constexpr (place_string_to_arena) { return ArenaKeyHolder{key, pool}; @@ -136,7 +144,7 @@ struct HashMethodFixedString size_t n; const ColumnFixedString::Chars * chars; - bool column_is_const = false; + std::function get_key_holder_impl; HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { @@ -144,13 +152,15 @@ struct HashMethodFixedString const ColumnFixedString & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); - column_is_const = isColumnConst(column); + if (isColumnConst(column)) + get_key_holder_impl = [this](size_t /*row*/) { return StringRef(&(*chars)[0], n); }; + else + get_key_holder_impl = [this](size_t row) { return StringRef(&(*chars)[row * n], n); }; } auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const { - size_t pos = column_is_const ? 0 : row; - StringRef key(&(*chars)[pos * n], n); + StringRef key = get_key_holder_impl(row); if constexpr (place_string_to_arena) { From ec49f7b3a3b19f34afa1ac413de492c50493b3d8 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 10 Aug 2022 19:28:08 +0800 Subject: [PATCH 101/164] fix style --- src/Common/ColumnsHashing.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 6aef60fffc7..e89c894c427 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -6,8 +6,8 @@ #include #include #include -#include "Columns/IColumn.h" -#include "Core/Field.h" +#include +#include #include #include @@ -110,7 +110,7 @@ struct HashMethodString chars = column_string.getChars().data(); if (column_is_const) - get_key_holder_impl = [this](size_t /*row*/) { return StringRef(chars, offsets[0] - 1); }; + get_key_holder_impl = [this](size_t /*row*/) { return StringRef(chars, offsets[0] - 1); }; else get_key_holder_impl = [this](size_t row) { return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); }; } From 9ecf1c156a18e73e42b4932da772a7e0c205d99b Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 10 Aug 2022 11:32:24 +0000 Subject: [PATCH 102/164] Skip newlines before Tags in clickhouse-test --- tests/clickhouse-test | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 952fc7fb0a9..179060be96d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1179,15 +1179,21 @@ class TestSuite: def is_shebang(line: str) -> bool: return line.startswith("#!") + def find_tag_line(file): + for line in file: + line = line.strip() + if line and not is_shebang(line): + return line + return '' + def load_tags_from_file(filepath): + comment_sign = get_comment_sign(filepath) with open(filepath, "r", encoding="utf-8") as file: try: - line = file.readline() - if is_shebang(line): - line = file.readline() + line = find_tag_line(file) except UnicodeDecodeError: return [] - return parse_tags_from_line(line, get_comment_sign(filepath)) + return parse_tags_from_line(line, comment_sign) all_tags = {} start_time = datetime.now() From c7615e9bde579de97c0f847d66ba9026d4dbd116 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 10 Aug 2022 12:28:28 +0000 Subject: [PATCH 103/164] fix style check --- src/Columns/ColumnArray.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnArray.cpp 
b/src/Columns/ColumnArray.cpp index bf646139b41..93bcc3eb611 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -56,7 +56,7 @@ ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && /// This will also prevent possible overflow in offset. if (data->size() != last_offset) - throw Exception( ErrorCodes::LOGICAL_ERROR, + throw Exception(ErrorCodes::LOGICAL_ERROR, "offsets_column has data inconsistent with nested_column. Data size: {}, last offset: {}", data->size(), last_offset); } From 4b2bba2ff1aa9a878335e3c975e648992fe76400 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 10 Aug 2022 14:37:06 +0200 Subject: [PATCH 104/164] Do not upload unnecessary lambda sources --- .../build_and_deploy_archive.sh | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index cd5b0d26e3f..defa400453f 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -7,18 +7,20 @@ cd "$WORKDIR" PY_EXEC=python3.9 LAMBDA_NAME=$(basename "$PWD") LAMBDA_NAME=${LAMBDA_NAME//_/-} -VENV=lambda-venv -rm -rf "$VENV" lambda-package.zip -"$PY_EXEC" -m venv "$VENV" -#virtualenv "$VENV" -# shellcheck disable=SC1091 -source "$VENV/bin/activate" -pip install -r requirements.txt PACKAGE=lambda-package rm -rf "$PACKAGE" "$PACKAGE".zip -cp -r "$VENV/lib/$PY_EXEC/site-packages" "$PACKAGE" +mkdir "$PACKAGE" cp app.py "$PACKAGE" -rm -r "$PACKAGE"/{pip,pip-*,setuptools,setuptools-*} -( cd "$PACKAGE" && zip -r ../"$PACKAGE".zip . ) +if [ -f requirements.txt ]; then + VENV=lambda-venv + rm -rf "$VENV" lambda-package.zip + "$PY_EXEC" -m venv "$VENV" + # shellcheck disable=SC1091 + source "$VENV/bin/activate" + pip install -r requirements.txt + cp -rT "$VENV/lib/$PY_EXEC/site-packages/" "$PACKAGE" + rm -r "$PACKAGE"/{pip,pip-*,setuptools,setuptools-*} +fi +( cd "$PACKAGE" && zip -9 -r ../"$PACKAGE".zip . ) aws lambda update-function-code --function-name "$LAMBDA_NAME" --zip-file fileb://"$PACKAGE".zip From 933043d84124e69f7fd2c632b37597f24fce5f07 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 10 Aug 2022 12:38:35 +0000 Subject: [PATCH 105/164] Minor fixes. --- src/Interpreters/AsynchronousMetrics.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index c1102a0652d..330afae8137 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -696,6 +696,8 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti Int64 peak = total_memory_tracker.getPeak(); Int64 rss = data.resident; + new_values["MemoryTrackingPeak"] = peak; + #if USE_JEMALLOC /// This is a memory which is kept by allocator. /// Remove it from RSS to decrease memory drift. 
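Because the peak is now written into new_values, it should surface together with the other asynchronous metrics. A hedged way to inspect it, assuming the usual system.asynchronous_metrics exposure and the metric name used above:

```sql
SELECT metric, value
FROM system.asynchronous_metrics
WHERE metric = 'MemoryTrackingPeak';
```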
@@ -710,7 +712,7 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti if (difference >= 1048576 || difference <= -1048576) { LOG_TRACE(log, - "MemoryTracking: allocated {}, peak {}, RSS {}, difference: {}", + "MemoryTracking: allocated {}, peak {}, RSS (adjusted) {}, difference: {}", ReadableSize(amount), ReadableSize(peak), ReadableSize(rss), From bf5e9173ad3c1351b13b69cab2790165d24528c8 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 10 Aug 2022 21:01:09 +0800 Subject: [PATCH 106/164] fix hashMethodOneNumber constructor --- src/Common/ColumnsHashing.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index e89c894c427..711c1c4096c 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -63,6 +63,15 @@ struct HashMethodOneNumber explicit HashMethodOneNumber(const IColumn * column) { vec = column->getRawData().data; + if (isColumnConst(*column)) + { + const_value = unalignedLoad(vec); + get_key_holder_impl = [this](size_t /*row*/) { return const_value; }; + } + else + { + get_key_holder_impl = [this](size_t row) { return unalignedLoad(vec + row * sizeof(FieldType)); }; + } } /// Creates context. Method is called once and result context is used in all threads. From 2fb68078e828256218c2bdf994b766de7f1a7185 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 10 Aug 2022 15:22:04 +0200 Subject: [PATCH 107/164] Replace S3 URLs by parameter --- tests/ci/ast_fuzzer_check.py | 3 ++- tests/ci/build_check.py | 9 ++++----- tests/ci/build_report_check.py | 3 ++- tests/ci/ccache_utils.py | 5 +++-- tests/ci/codebrowser_check.py | 11 +++++++---- tests/ci/compatibility_check.py | 4 ++-- tests/ci/docker_images_check.py | 4 ++-- tests/ci/docker_manifests_merge.py | 4 ++-- tests/ci/docker_server.py | 7 +++---- tests/ci/docs_check.py | 4 ++-- tests/ci/docs_release.py | 4 ++-- tests/ci/env_helper.py | 1 + tests/ci/fast_test_check.py | 4 ++-- tests/ci/functional_test_check.py | 6 +++--- tests/ci/integration_test_check.py | 4 ++-- tests/ci/keeper_jepsen_check.py | 6 +++--- tests/ci/performance_comparison_check.py | 6 +++--- tests/ci/push_to_artifactory.py | 4 ++-- tests/ci/s3_helper.py | 15 ++++----------- tests/ci/split_build_smoke_check.py | 4 ++-- tests/ci/stress_check.py | 6 +++--- tests/ci/style_check.py | 4 ++-- tests/ci/unit_tests_check.py | 4 ++-- 23 files changed, 60 insertions(+), 62 deletions(-) diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 918e27a4e11..82e7a3271c1 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -12,6 +12,7 @@ from env_helper import ( GITHUB_RUN_URL, REPORTS_PATH, REPO_COPY, + S3_URL, TEMP_PATH, ) from s3_helper import S3Helper @@ -117,7 +118,7 @@ if __name__ == "__main__": "core.gz": os.path.join(workspace_path, "core.gz"), } - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) for f in paths: try: paths[f] = s3_helper.upload_test_report_to_s3(paths[f], s3_prefix + "/" + f) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 488fd1bbb34..2463ec669dd 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -15,6 +15,7 @@ from env_helper import ( IMAGES_PATH, REPO_COPY, S3_BUILDS_BUCKET, + S3_URL, TEMP_PATH, ) from s3_helper import S3Helper @@ -142,11 +143,9 @@ def check_for_success_run( for url in build_results: url_escaped = url.replace("+", "%2B").replace(" ", "%20") if BUILD_LOG_NAME in url: - log_url = 
f"https://s3.amazonaws.com/{S3_BUILDS_BUCKET}/{url_escaped}" + log_url = f"{S3_URL}/{S3_BUILDS_BUCKET}/{url_escaped}" else: - build_urls.append( - f"https://s3.amazonaws.com/{S3_BUILDS_BUCKET}/{url_escaped}" - ) + build_urls.append(f"{S3_URL}/{S3_BUILDS_BUCKET}/{url_escaped}") if not log_url: # log is uploaded the last, so if there's no log we need to rerun the build return @@ -250,7 +249,7 @@ def main(): logging.info("Repo copy path %s", REPO_COPY) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) version = get_version_from_repo(git=Git(True)) release_or_pr, performance_pr = get_release_or_pr(pr_info, version) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index f1f92cded1d..3155b4fd56d 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -14,6 +14,7 @@ from env_helper import ( GITHUB_RUN_URL, GITHUB_SERVER_URL, REPORTS_PATH, + S3_URL, TEMP_PATH, ) from report import create_build_html_report @@ -244,7 +245,7 @@ def main(): logging.error("No success builds, failing check") sys.exit(1) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" diff --git a/tests/ci/ccache_utils.py b/tests/ci/ccache_utils.py index bd155b02cb4..fd3589e1bb3 100644 --- a/tests/ci/ccache_utils.py +++ b/tests/ci/ccache_utils.py @@ -7,9 +7,10 @@ import os import shutil from pathlib import Path -import requests +import requests # type: ignore from compress_files import decompress_fast, compress_fast +from env_helper import S3_URL, S3_BUILDS_BUCKET DOWNLOAD_RETRIES_COUNT = 5 @@ -73,7 +74,7 @@ def get_ccache_if_not_exists( for obj in objects: if ccache_name in obj: logging.info("Found ccache on path %s", obj) - url = "https://s3.amazonaws.com/clickhouse-builds/" + obj + url = f"{S3_URL}/{S3_BUILDS_BUCKET}/{obj}" compressed_cache = os.path.join(temp_path, os.path.basename(obj)) dowload_file_with_progress(url, compressed_cache) diff --git a/tests/ci/codebrowser_check.py b/tests/ci/codebrowser_check.py index 230a778c598..121339d9971 100644 --- a/tests/ci/codebrowser_check.py +++ b/tests/ci/codebrowser_check.py @@ -7,7 +7,7 @@ import logging from github import Github -from env_helper import IMAGES_PATH, REPO_COPY +from env_helper import IMAGES_PATH, REPO_COPY, S3_TEST_REPORTS_BUCKET, S3_URL from stopwatch import Stopwatch from upload_result_helper import upload_results from s3_helper import S3Helper @@ -23,7 +23,7 @@ def get_run_command(repo_path, output_path, image): cmd = ( "docker run " + f"--volume={repo_path}:/repo_folder " f"--volume={output_path}:/test_output " - f"-e 'DATA=https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data' {image}" + f"-e 'DATA={S3_URL}/{S3_TEST_REPORTS_BUCKET}/codebrowser/data' {image}" ) return cmd @@ -41,7 +41,7 @@ if __name__ == "__main__": os.makedirs(temp_path) docker_image = get_image_with_version(IMAGES_PATH, "clickhouse/codebrowser") - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) result_path = os.path.join(temp_path, "result_path") if not os.path.exists(result_path): @@ -69,7 +69,10 @@ if __name__ == "__main__": report_path, s3_path_prefix, "clickhouse-test-reports" ) - index_html = 'HTML report' + index_html = ( + '' + "HTML report" + ) test_results = [(index_html, "Look at the report")] diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index 2a1b9716189..fc7584536ef 100644 --- 
a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -8,7 +8,7 @@ import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo @@ -169,7 +169,7 @@ if __name__ == "__main__": subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) state, description, test_results, additional_logs = process_result( result_path, server_log_path ) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 8b838defa8b..76ebbb78c7b 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -14,7 +14,7 @@ from github import Github from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse from commit_status_helper import post_commit_status -from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP, GITHUB_RUN_URL +from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP, GITHUB_RUN_URL, S3_URL from get_robot_token import get_best_robot_token, get_parameter_from_ssm from pr_info import PRInfo from s3_helper import S3Helper @@ -460,7 +460,7 @@ def main(): with open(changed_json, "w", encoding="utf-8") as images_file: json.dump(result_images, images_file) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) s3_path_prefix = ( str(pr_info.number) + "/" + pr_info.sha + "/" + NAME.lower().replace(" ", "_") diff --git a/tests/ci/docker_manifests_merge.py b/tests/ci/docker_manifests_merge.py index 00ab0b9e77f..78f236be786 100644 --- a/tests/ci/docker_manifests_merge.py +++ b/tests/ci/docker_manifests_merge.py @@ -11,7 +11,7 @@ from github import Github from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse from commit_status_helper import post_commit_status -from env_helper import RUNNER_TEMP +from env_helper import RUNNER_TEMP, S3_URL from get_robot_token import get_best_robot_token, get_parameter_from_ssm from pr_info import PRInfo from s3_helper import S3Helper @@ -203,7 +203,7 @@ def main(): json.dump(changed_images, ci) pr_info = PRInfo() - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [], NAME) diff --git a/tests/ci/docker_server.py b/tests/ci/docker_server.py index 09a75206442..64172b90ebc 100644 --- a/tests/ci/docker_server.py +++ b/tests/ci/docker_server.py @@ -16,7 +16,7 @@ from build_check import get_release_or_pr from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse from commit_status_helper import post_commit_status from docker_images_check import DockerImage -from env_helper import CI, GITHUB_RUN_URL, RUNNER_TEMP, S3_BUILDS_BUCKET +from env_helper import CI, GITHUB_RUN_URL, RUNNER_TEMP, S3_BUILDS_BUCKET, S3_URL from get_robot_token import get_best_robot_token, get_parameter_from_ssm from git_helper import Git from pr_info import PRInfo @@ -309,8 +309,7 @@ def main(): pr_info = PRInfo() release_or_pr, _ = get_release_or_pr(pr_info, args.version) args.bucket_prefix = ( - f"https://s3.amazonaws.com/{S3_BUILDS_BUCKET}/" - f"{release_or_pr}/{pr_info.sha}" + f"{S3_URL}/{S3_BUILDS_BUCKET}/{release_or_pr}/{pr_info.sha}" ) if args.push: @@ -336,7 +335,7 @@ def main(): status = "failure" pr_info = pr_info or PRInfo() - 
s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, [], NAME) diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index cf4fd8da692..f260d1f1e50 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -6,7 +6,7 @@ import os import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY +from env_helper import TEMP_PATH, REPO_COPY, S3_URL from s3_helper import S3Helper from pr_info import PRInfo from get_robot_token import get_best_robot_token @@ -120,7 +120,7 @@ if __name__ == "__main__": else: lines.append(("Non zero exit code", "FAIL")) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) ch_helper = ClickHouseHelper() report_url = upload_results( diff --git a/tests/ci/docs_release.py b/tests/ci/docs_release.py index 35203486fae..96b0e7048c6 100644 --- a/tests/ci/docs_release.py +++ b/tests/ci/docs_release.py @@ -7,7 +7,7 @@ import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY, CLOUDFLARE_TOKEN +from env_helper import TEMP_PATH, REPO_COPY, CLOUDFLARE_TOKEN, S3_URL from s3_helper import S3Helper from pr_info import PRInfo from get_robot_token import get_best_robot_token @@ -106,7 +106,7 @@ if __name__ == "__main__": else: lines.append(("Non zero exit code", "FAIL")) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) report_url = upload_results( s3_helper, pr_info.number, pr_info.sha, lines, additional_files, NAME diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index b37d38763be..b6541205ed3 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -22,6 +22,7 @@ IMAGES_PATH = os.getenv("IMAGES_PATH", TEMP_PATH) REPORTS_PATH = os.getenv("REPORTS_PATH", p.abspath(p.join(module_dir, "./reports"))) REPO_COPY = os.getenv("REPO_COPY", git_root) RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) +S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "clickhouse-builds") S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "clickhouse-test-reports") diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 9852175ca92..84d9d3f16d8 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -9,7 +9,7 @@ import atexit from github import Github -from env_helper import CACHES_PATH, TEMP_PATH +from env_helper import CACHES_PATH, TEMP_PATH, S3_URL from pr_info import FORCE_TESTS_LABEL, PRInfo from s3_helper import S3Helper from get_robot_token import get_best_robot_token @@ -105,7 +105,7 @@ if __name__ == "__main__": docker_image = get_image_with_version(temp_path, "clickhouse/fasttest") - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) workspace = os.path.join(temp_path, "fasttest-workspace") if not os.path.exists(workspace): diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 690ac3c1851..bcfeaa9973a 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -10,7 +10,7 @@ import atexit from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import FORCE_TESTS_LABEL, PRInfo @@ -88,7 +88,7 @@ def get_run_command( envs = [ f"-e 
MAX_RUN_TIME={int(0.9 * kill_timeout)}", - '-e S3_URL="https://clickhouse-datasets.s3.amazonaws.com"', + f'-e S3_URL="{S3_URL}/clickhouse-datasets"', ] if flaky_check: @@ -314,7 +314,7 @@ if __name__ == "__main__": subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) state, description, test_results, additional_logs = process_results( result_path, server_log_path diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 565864d576c..49a95748f6c 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -10,7 +10,7 @@ import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo @@ -249,7 +249,7 @@ if __name__ == "__main__": ch_helper = ClickHouseHelper() mark_flaky_tests(ch_helper, check_name, test_results) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) report_url = upload_results( s3_helper, pr_info.number, diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 88ccf8e8828..af44b87b897 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -9,7 +9,7 @@ import boto3 from github import Github import requests -from env_helper import REPO_COPY, TEMP_PATH +from env_helper import REPO_COPY, TEMP_PATH, S3_BUILDS_BUCKET, S3_URL from stopwatch import Stopwatch from upload_result_helper import upload_results from s3_helper import S3Helper @@ -192,7 +192,7 @@ if __name__ == "__main__": # run (see .github/workflows/jepsen.yml) So we cannot add explicit # dependency on a build job and using busy loop on it's results. For the # same reason we are using latest docker image. 
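All of the CI hunks in this patch follow one pattern: the S3 endpoint is read from the environment (S3_URL, defaulting to the public AWS endpoint) and every bucket URL is composed from that single value, so the scripts can be pointed at any S3-compatible storage. Below is a minimal, self-contained C++ sketch of the same configuration pattern; it is only an illustration, not part of the patch, and the helper names get_s3_url/make_bucket_url are invented for the example.

#include <cstdlib>
#include <iostream>
#include <string>

/// Endpoint comes from the environment, with the public AWS endpoint as the default,
/// mirroring S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") in env_helper.py.
std::string get_s3_url()
{
    const char * env = std::getenv("S3_URL");
    return env ? std::string(env) : "https://s3.amazonaws.com";
}

/// Every artifact URL is built from the single endpoint value instead of a hardcoded host.
std::string make_bucket_url(const std::string & bucket, const std::string & path)
{
    return get_s3_url() + "/" + bucket + "/" + path;
}

int main()
{
    /// Illustrative bucket/path only.
    std::cout << make_bucket_url("clickhouse-builds", "12345/abcdef/clickhouse") << '\n';
    return 0;
}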
- build_url = f"https://s3.amazonaws.com/clickhouse-builds/{release_or_pr}/{pr_info.sha}/{build_name}/clickhouse" + build_url = f"{S3_URL}/{S3_BUILDS_BUCKET}/{release_or_pr}/{pr_info.sha}/{build_name}/clickhouse" head = requests.head(build_url) counter = 0 while head.status_code != 200: @@ -248,7 +248,7 @@ if __name__ == "__main__": description = "No Jepsen output log" test_result = [("No Jepsen output log", "FAIL")] - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) report_url = upload_results( s3_helper, pr_info.number, diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index 57a52dcaa6a..ce5226aeb04 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -15,7 +15,7 @@ from github import Github from commit_status_helper import get_commit, post_commit_status from ci_config import CI_CONFIG from docker_pull_helper import get_image_with_version -from env_helper import GITHUB_EVENT_PATH, GITHUB_RUN_URL +from env_helper import GITHUB_EVENT_PATH, GITHUB_RUN_URL, S3_BUILDS_BUCKET, S3_URL from get_robot_token import get_best_robot_token, get_parameter_from_ssm from pr_info import PRInfo from rerun_helper import RerunHelper @@ -86,7 +86,7 @@ if __name__ == "__main__": docker_env = "" - docker_env += " -e S3_URL=https://s3.amazonaws.com/clickhouse-builds" + docker_env += f" -e S3_URL={S3_URL}/{S3_BUILDS_BUCKET}" docker_env += f" -e BUILD_NAME={required_build}" if pr_info.number == 0: @@ -197,7 +197,7 @@ if __name__ == "__main__": } s3_prefix = f"{pr_info.number}/{pr_info.sha}/{check_name_prefix}/" - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) uploaded = {} # type: Dict[str, str] for name, path in paths.items(): try: diff --git a/tests/ci/push_to_artifactory.py b/tests/ci/push_to_artifactory.py index 98de315ddae..b04fa723580 100755 --- a/tests/ci/push_to_artifactory.py +++ b/tests/ci/push_to_artifactory.py @@ -9,7 +9,7 @@ from typing import Dict, List, Tuple from artifactory import ArtifactorySaaSPath # type: ignore from build_download_helper import dowload_build_with_progress -from env_helper import RUNNER_TEMP, S3_BUILDS_BUCKET +from env_helper import RUNNER_TEMP, S3_BUILDS_BUCKET, S3_URL from git_helper import TAG_REGEXP, commit, removeprefix, removesuffix @@ -98,7 +98,7 @@ class Packages: class S3: template = ( - "https://s3.amazonaws.com/" + f"{S3_URL}" # "clickhouse-builds/" f"{S3_BUILDS_BUCKET}/" # "33333/" or "21.11/" from --release, if pull request is omitted diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 91e67135f6f..483d6aee60e 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -9,7 +9,7 @@ from multiprocessing.dummy import Pool import boto3 # type: ignore -from env_helper import S3_TEST_REPORTS_BUCKET, S3_BUILDS_BUCKET, RUNNER_TEMP, CI +from env_helper import S3_TEST_REPORTS_BUCKET, S3_BUILDS_BUCKET, RUNNER_TEMP, CI, S3_URL from compress_files import compress_file_fast @@ -98,13 +98,8 @@ class S3Helper: logging.info("Upload %s to %s. 
Meta: %s", file_path, s3_path, metadata) # last two replacements are specifics of AWS urls: # https://jamesd3142.wordpress.com/2018/02/28/amazon-s3-and-the-plus-symbol/ - return ( - "https://s3.amazonaws.com/{bucket}/{path}".format( - bucket=bucket_name, path=s3_path - ) - .replace("+", "%2B") - .replace(" ", "%20") - ) + url = f"{S3_URL}/{bucket_name}/{s3_path}" + return url.replace("+", "%2B").replace(" ", "%20") def upload_test_report_to_s3(self, file_path, s3_path): if CI: @@ -175,9 +170,7 @@ class S3Helper: t = time.time() except Exception as ex: logging.critical("Failed to upload file, expcetion %s", ex) - return "https://s3.amazonaws.com/{bucket}/{path}".format( - bucket=bucket_name, path=s3_path - ) + return f"{S3_URL}/{bucket_name}/{s3_path}" p = Pool(256) diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 87a528d2761..5052b6b362e 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -7,7 +7,7 @@ import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo @@ -126,7 +126,7 @@ if __name__ == "__main__": ) ch_helper = ClickHouseHelper() - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) report_url = upload_results( s3_helper, pr_info.number, diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index e63f66e2e50..6073b03f8a6 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -8,7 +8,7 @@ import sys from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo @@ -31,7 +31,7 @@ def get_run_command( ): cmd = ( "docker run --cap-add=SYS_PTRACE " - "-e S3_URL='https://clickhouse-datasets.s3.amazonaws.com' " + f"-e S3_URL='{S3_URL}/clickhouse-datasets' " f"--volume={build_path}:/package_folder " f"--volume={result_folder}:/test_output " f"--volume={repo_tests_path}:/usr/share/clickhouse-test " @@ -148,7 +148,7 @@ if __name__ == "__main__": subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) state, description, test_results, additional_logs = process_results( result_path, server_log_path, run_log_path ) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 66837ccb84e..db286ec7f6c 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -15,7 +15,7 @@ from clickhouse_helper import ( ) from commit_status_helper import post_commit_status, update_mergeable_check from docker_pull_helper import get_image_with_version -from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP +from env_helper import GITHUB_WORKSPACE, RUNNER_TEMP, S3_URL from get_robot_token import get_best_robot_token from github_helper import GitHub from git_helper import git_runner @@ -166,7 +166,7 @@ if __name__ == "__main__": os.makedirs(temp_path) docker_image = get_image_with_version(temp_path, "clickhouse/style-test") - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) cmd = ( f"docker run -u $(id -u ${{USER}}):$(id -g ${{USER}}) --cap-add=SYS_PTRACE " diff --git a/tests/ci/unit_tests_check.py 
b/tests/ci/unit_tests_check.py index 4441709cb7b..95011b728e9 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -7,7 +7,7 @@ import subprocess from github import Github -from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH +from env_helper import TEMP_PATH, REPO_COPY, REPORTS_PATH, S3_URL from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo @@ -147,7 +147,7 @@ if __name__ == "__main__": subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - s3_helper = S3Helper("https://s3.amazonaws.com") + s3_helper = S3Helper(S3_URL) state, description, test_results, additional_logs = process_result(test_output) ch_helper = ClickHouseHelper() From b7c5c54181ac60add1e4739b55266d201c621960 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 10 Aug 2022 13:43:55 +0000 Subject: [PATCH 108/164] Fix build --- src/Interpreters/ActionsVisitor.cpp | 1 - .../QueryPlan/ReadFromMergeTree.cpp | 4 +-- src/Storages/Hive/StorageHive.cpp | 4 +-- src/Storages/MergeTree/KeyCondition.cpp | 29 +++++++++---------- src/Storages/MergeTree/KeyCondition.h | 11 +++++++ src/Storages/MergeTree/MergeTreeData.cpp | 4 +-- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- .../MergeTree/MergeTreeIndexMinMax.cpp | 2 +- src/Storages/MergeTree/PartitionPruner.h | 2 +- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 9b9552f37cb..6c9e54a966d 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -945,7 +945,6 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & data.source_columns, std::make_shared(data.source_columns), data.prepared_sets, - data.subqueries_for_sets, data.no_subqueries, data.no_makeset, data.only_consts, diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 8af2f97909f..0d6f591b43a 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -905,11 +905,11 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( nodes.nodes.push_back(&node); } - key_condition.emplace(std::move(nodes), query_info.syntax_analyzer_result, query_info.sets, context, primary_key_columns, primary_key.expression); + key_condition.emplace(std::move(nodes), query_info.syntax_analyzer_result, query_info.prepared_sets, context, primary_key_columns, primary_key.expression); } else { - key_condition.emplace(query_info.query, query_info.syntax_analyzer_result, query_info.sets, context, primary_key_columns, primary_key.expression); + key_condition.emplace(query_info, context, primary_key_columns, primary_key.expression); } if (settings.force_primary_key && key_condition->alwaysUnknownOrTrue()) diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index f6ac40838c4..01ee5a8c3c5 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -622,7 +622,7 @@ HiveFiles StorageHive::collectHiveFilesFromPartition( for (size_t i = 0; i < partition_names.size(); ++i) ranges.emplace_back(fields[i]); - const KeyCondition partition_key_condition(query_info.query, query_info.syntax_analyzer_result, query_info.sets, getContext(), partition_names, partition_minmax_idx_expr); + const KeyCondition partition_key_condition(query_info, getContext(), partition_names, partition_minmax_idx_expr); if 
(!partition_key_condition.checkInHyperrectangle(ranges, partition_types).can_be_true) return {}; } @@ -690,7 +690,7 @@ HiveFilePtr StorageHive::getHiveFileIfNeeded( if (prune_level >= PruneLevel::File) { - const KeyCondition hivefile_key_condition(query_info.query, query_info.syntax_analyzer_result, query_info.sets, getContext(), hivefile_name_types.getNames(), hivefile_minmax_idx_expr); + const KeyCondition hivefile_key_condition(query_info, getContext(), hivefile_name_types.getNames(), hivefile_minmax_idx_expr); if (hive_file->useFileMinMaxIndex()) { /// Load file level minmax index and apply diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index b8a16951269..7128558b734 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -279,14 +279,14 @@ public: } ConstSetPtr tryGetPreparedSet( - const PreparedSets & sets, + const PreparedSetsPtr & sets, const std::vector & indexes_mapping, const DataTypes & data_types) const { - if (ast) + if (sets && ast) { if (ast->as() || ast->as()) - return prepared_sets->get(PreparedSetKey::forSubquery(*set)); + return sets->get(PreparedSetKey::forSubquery(*ast)); /// We have `PreparedSetKey::forLiteral` but it is useless here as we don't have enough information /// about types in left argument of the IN operator. Instead, we manually iterate through all the sets @@ -303,26 +303,23 @@ public: return true; }; - for (const auto & set : prepared_sets->getByTreeHash(right_arg->getTreeHash())) + for (const auto & set : sets->getByTreeHash(ast->getTreeHash())) { if (types_match(set)) return set; } } - else + else if (dag->column) { - if (dag->column) - { - const IColumn * col = dag->column.get(); - if (const auto * col_const = typeid_cast(col)) - col = &col_const->getDataColumn(); + const IColumn * col = dag->column.get(); + if (const auto * col_const = typeid_cast(col)) + col = &col_const->getDataColumn(); - if (const auto * col_set = typeid_cast(col)) - { - auto set = col_set->getData(); - if (set->isCreated()) - return set; - } + if (const auto * col_set = typeid_cast(col)) + { + auto set = col_set->getData(); + if (set->isCreated()) + return set; } } diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 5fb1847c8d1..3c2089a56d7 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -216,6 +216,17 @@ public: bool single_point_ = false, bool strict_ = false); + KeyCondition( + const SelectQueryInfo & query_info, + ContextPtr context, + const Names & key_column_names, + const ExpressionActionsPtr & key_expr_, + bool single_point_ = false, + bool strict_ = false) + : KeyCondition(query_info.query, query_info.syntax_analyzer_result, query_info.prepared_sets, + context, key_column_names, key_expr_, single_point_, strict_) + {} + KeyCondition( ActionDAGNodes dag_nodes, TreeRewriterResultPtr syntax_analyzer_result, diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 377f5e6e0fc..594b4a32f9c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5293,9 +5293,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( minmax_columns_types = getMinMaxColumnsTypes(partition_key); minmax_idx_condition.emplace( - query_info.query, query_info.syntax_analyzer_result, query_info.sets, - query_context, - minmax_columns_names, + query_info, query_context, minmax_columns_names, getMinMaxExpr(partition_key, 
ExpressionActionsSettings::fromContext(query_context))); partition_pruner.emplace(metadata_snapshot, query_info, query_context, false /* strict */); } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index c223e285c39..ba3505b5886 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -770,7 +770,7 @@ void MergeTreeDataSelectExecutor::filterPartsByPartition( minmax_columns_types = data.getMinMaxColumnsTypes(partition_key); minmax_idx_condition.emplace( - query_info.query, query_info.syntax_analyzer_result, query_info.sets, context, minmax_columns_names, data.getMinMaxExpr(partition_key, ExpressionActionsSettings::fromContext(context))); + query_info, context, minmax_columns_names, data.getMinMaxExpr(partition_key, ExpressionActionsSettings::fromContext(context))); partition_pruner.emplace(metadata_snapshot, query_info, context, false /* strict */); if (settings.force_index_by_date && (minmax_idx_condition->alwaysUnknownOrTrue() && partition_pruner->isUseless())) diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index 80f3e140c41..05319ecc62e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -161,7 +161,7 @@ MergeTreeIndexConditionMinMax::MergeTreeIndexConditionMinMax( const SelectQueryInfo & query, ContextPtr context) : index_data_types(index.data_types) - , condition(query.query, query.syntax_analyzer_result, query.sets, context, index.column_names, index.expression) + , condition(query, context, index.column_names, index.expression) { } diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h index 3af52fd9a38..675fef1433d 100644 --- a/src/Storages/MergeTree/PartitionPruner.h +++ b/src/Storages/MergeTree/PartitionPruner.h @@ -27,7 +27,7 @@ public: PartitionPruner(const StorageMetadataPtr & metadata, const SelectQueryInfo & query_info, ContextPtr context, bool strict) : partition_key(MergeTreePartition::adjustPartitionKey(metadata, context)) , partition_condition( - query_info.query, query_info.syntax_analyzer_result, query_info.sets, + query_info.query, query_info.syntax_analyzer_result, query_info.prepared_sets, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict) , useless(strict ? 
partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue()) { From 347ffbf178f6279fbfa2363525c1f8b638f4e6fe Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 Aug 2022 14:19:27 +0000 Subject: [PATCH 109/164] Fix special build --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index b41ff445294..450a5638704 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit b41ff4452944d50a44ad9c6e4621b50f44e9742e +Subproject commit 450a5638704386356f8e520080468fc9bc8bcaf8 From 33473458735894068de84b96679805f88924b388 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 7 Jul 2022 12:26:34 +0000 Subject: [PATCH 110/164] Join with dictionary uses DirectJoin --- src/Interpreters/DictionaryJoinAdapter.cpp | 94 +++++++++++++++++++ src/Interpreters/DictionaryJoinAdapter.h | 34 +++++++ src/Interpreters/DirectJoin.cpp | 28 +++++- src/Interpreters/ExpressionAnalyzer.cpp | 80 +++++++++++++--- src/Interpreters/HashJoin.cpp | 34 +------ src/Interpreters/JoinedTables.cpp | 1 + src/Interpreters/TableJoin.cpp | 10 ++ src/Interpreters/TableJoin.h | 7 ++ src/Interpreters/join_common.cpp | 30 ++++++ src/Interpreters/join_common.h | 3 + .../01115_join_with_dictionary.sql | 2 + 11 files changed, 274 insertions(+), 49 deletions(-) create mode 100644 src/Interpreters/DictionaryJoinAdapter.cpp create mode 100644 src/Interpreters/DictionaryJoinAdapter.h diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp new file mode 100644 index 00000000000..9866a25901a --- /dev/null +++ b/src/Interpreters/DictionaryJoinAdapter.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +DictionaryJoinAdapter::DictionaryJoinAdapter( + std::shared_ptr dictionary_, const Names & result_column_names) + : IKeyValueStorage(StorageID::createEmpty()) + , dictionary(dictionary_) +{ + if (!dictionary) + throw Exception("Dictionary is not initialized", ErrorCodes::LOGICAL_ERROR); + + const auto & key_types = dictionary->getStructure().getKeyTypes(); + const auto & key_names = dictionary->getStructure().getKeysNames(); + if (key_types.size() != key_names.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary '{}' has invalid structure", dictionary->getFullName()); + + StorageInMemoryMetadata storage_metadata; + + for (size_t i = 0; i < key_types.size(); ++i) + { + storage_metadata.columns.add(ColumnDescription(key_names[i], key_types[i])); + } + + for (const auto & attr_name : result_column_names) + { + const auto & attr = dictionary->getStructure().getAttribute(attr_name); + storage_metadata.columns.add(ColumnDescription(attr_name, attr.type)); + + attribute_names.emplace_back(attr_name); + result_types.emplace_back(attr.type); + } + + /// Fill in memory metadata to make getSampleBlock work. 
+ setInMemoryMetadata(storage_metadata); +} + +Names DictionaryJoinAdapter::getPrimaryKey() const +{ + return dictionary->getStructure().getKeysNames(); +} + +Chunk DictionaryJoinAdapter::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const +{ + if (keys.empty()) + return {}; + + Columns key_columns; + DataTypes key_types; + for (const auto & key : keys) + { + key_columns.emplace_back(key.column); + key_types.emplace_back(key.type); + } + + { + out_null_map.clear(); + + auto mask = dictionary->hasKeys(key_columns, key_types); + const auto & mask_data = mask->getData(); + + out_null_map.resize(mask_data.size(), 0); + std::copy(mask_data.begin(), mask_data.end(), out_null_map.begin()); + } + + Columns default_cols(result_types.size()); + for (size_t i = 0; i < result_types.size(); ++i) + /// Dictinonary may have non-standart default values specified + default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); + + Columns result_columns = dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_cols); + + for (const auto & key_col : key_columns) + { + /// Insert default values for keys that were not found + ColumnPtr filtered_key_col = JoinCommon::filterWithBlanks(key_col, out_null_map); + result_columns.insert(result_columns.begin(), filtered_key_col); + } + + size_t num_rows = result_columns[0]->size(); + return Chunk(std::move(result_columns), num_rows); +} + +} diff --git a/src/Interpreters/DictionaryJoinAdapter.h b/src/Interpreters/DictionaryJoinAdapter.h new file mode 100644 index 00000000000..adfa75a996b --- /dev/null +++ b/src/Interpreters/DictionaryJoinAdapter.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/// Used in join with dictionary to provide sufficient interface to DirectJoin +class DictionaryJoinAdapter : public IKeyValueStorage +{ +public: + DictionaryJoinAdapter( + std::shared_ptr dictionary_, const Names & result_column_names); + + Names getPrimaryKey() const override; + + Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const override; + + std::string getName() const override + { + return dictionary->getFullName(); + } + +private: + std::shared_ptr dictionary; + + Strings attribute_names; + DataTypes result_types; +}; + +} diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index af6bd484753..ceacca2fdca 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -71,11 +71,18 @@ DirectKeyValueJoin::DirectKeyValueJoin(std::shared_ptr table_join_, throw DB::Exception(ErrorCodes::UNSUPPORTED_JOIN_KEYS, "Not supported by direct JOIN"); } - if (table_join->strictness() != JoinStrictness::All && - table_join->strictness() != JoinStrictness::Any && - table_join->strictness() != JoinStrictness::RightAny) + bool allowed_inner = isInner(table_join->kind()) && (table_join->strictness() == ASTTableJoin::Strictness::All || + table_join->strictness() == ASTTableJoin::Strictness::Any || + table_join->strictness() != JoinStrictness::RightAny); + + bool allowed_left = isLeft(table_join->kind()) && (table_join->strictness() == ASTTableJoin::Strictness::Any || + table_join->strictness() == ASTTableJoin::Strictness::All || + table_join->strictness() == ASTTableJoin::Strictness::Semi || + table_join->strictness() == ASTTableJoin::Strictness::Anti); + if (!allowed_inner && !allowed_left) { - throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "Not supported by direct 
JOIN"); + throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "Strictness {} and kind {} is not supported by direct JOIN", + table_join->strictness(), table_join->kind()); } LOG_TRACE(log, "Using direct join"); @@ -116,7 +123,18 @@ void DirectKeyValueJoin::joinBlock(Block & block, std::shared_ptr &) block.insert(std::move(col)); } - if (!isLeftOrFull(table_join->kind())) + bool is_semi_join = table_join->strictness() == ASTTableJoin::Strictness::Semi; + bool is_anti_join = table_join->strictness() == ASTTableJoin::Strictness::Anti; + + if (is_anti_join) + { + /// invert null_map + for (auto & val : null_map) + val = !val; + } + + /// Filter non joined rows + if (isInner(table_join->kind()) || (isLeft(table_join->kind()) && (is_semi_join || is_anti_join))) { MutableColumns dst_columns = block.mutateColumns(); for (auto & col : dst_columns) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 4f59a45b628..a6c91f7ace3 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,7 @@ #include #include #include - +#include #include @@ -1014,19 +1015,15 @@ static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr ana if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) { - if (JoinPtr kvjoin = tryKeyValueJoin(analyzed_join, right_sample_block)) - { - /// Do not need to execute plan for right part - joined_plan.reset(); - return kvjoin; - } + JoinPtr direct_join = nullptr; + direct_join = direct_join ? direct_join : tryKeyValueJoin(analyzed_join, right_sample_block); + direct_join = direct_join ? direct_join : tryDictJoin(analyzed_join, right_sample_block, getContext()); - /// It's not a hash join actually, that's why we check JoinAlgorithm::DIRECT - /// It's would be fixed in https://github.com/ClickHouse/ClickHouse/pull/38956 - if (analyzed_join->tryInitDictJoin(right_sample_block, context)) + if (direct_join) { + /// Do not need to execute plan for right part, it's ready. 
joined_plan.reset(); - return std::make_shared(analyzed_join, right_sample_block); + return direct_join; } } @@ -1113,6 +1110,66 @@ static std::unique_ptr buildJoinedPlan( return joined_plan; } +std::shared_ptr tryDictJoin(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context) +{ + using Strictness = ASTTableJoin::Strictness; + + bool allowed_inner = isInner(analyzed_join->kind()) && analyzed_join->strictness() == Strictness::All; + bool allowed_left = isLeft(analyzed_join->kind()) && (analyzed_join->strictness() == Strictness::Any || + analyzed_join->strictness() == Strictness::All || + analyzed_join->strictness() == Strictness::Semi || + analyzed_join->strictness() == Strictness::Anti); + if (!allowed_inner && !allowed_left) + { + LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: {} {} is not supported", + analyzed_join->kind(), analyzed_join->strictness()); + return nullptr; + } + + if (analyzed_join->getClauses().size() != 1 || analyzed_join->getClauses()[0].key_names_right.size() != 1) + { + LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: only one key is supported"); + return nullptr; + } + + const auto & right_key = analyzed_join->getOnlyClause().key_names_right[0]; + + const auto & dictionary_name = analyzed_join->getRightStorageName(); + if (dictionary_name.empty()) + { + LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: dictionary was not found"); + return nullptr; + } + + FunctionDictHelper dictionary_helper(context); + + auto dictionary = dictionary_helper.getDictionary(dictionary_name); + if (!dictionary) + { + LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: dictionary was not found"); + return nullptr; + } + + const auto & dict_keys = dictionary->getStructure().getKeysNames(); + if (dict_keys.size() != 1 || dict_keys[0] != analyzed_join->getOriginalName(right_key)) + { + LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: join key '{}' doesn't natch to dictionary key ({})", + right_key, fmt::join(dict_keys, ", ")); + return nullptr; + } + + Names attr_names; + for (const auto & col : right_sample_block) + { + if (col.name == right_key) + continue; + attr_names.push_back(analyzed_join->getOriginalName(col.name)); + } + + auto dict_reader = std::make_shared(dictionary, attr_names); + return std::make_shared(analyzed_join, right_sample_block, dict_reader); +} + std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block) { if (!analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) @@ -1192,7 +1249,6 @@ JoinPtr SelectQueryExpressionAnalyzer::makeJoin( } JoinPtr join = chooseJoinAlgorithm(analyzed_join, joined_plan, getContext()); - return join; } diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 95a0008c257..55a18cb4193 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -179,36 +179,6 @@ namespace JoinStuff } } -static ColumnPtr filterWithBlanks(ColumnPtr src_column, const IColumn::Filter & filter, bool inverse_filter = false) -{ - ColumnPtr column = src_column->convertToFullColumnIfConst(); - MutableColumnPtr mut_column = column->cloneEmpty(); - mut_column->reserve(column->size()); - - if (inverse_filter) - { - for (size_t row = 0; row < filter.size(); ++row) - { - if (filter[row]) - mut_column->insertDefault(); - else - mut_column->insertFrom(*column, row); - } - } - else - { - for (size_t row = 0; row < filter.size(); ++row) - { - if 
(filter[row]) - mut_column->insertFrom(*column, row); - else - mut_column->insertDefault(); - } - } - - return mut_column; -} - static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, bool nullable) { if (nullable) @@ -220,7 +190,7 @@ static ColumnWithTypeAndName correctNullability(ColumnWithTypeAndName && column, /// We have to replace values masked by NULLs with defaults. if (column.column) if (const auto * nullable_column = checkAndGetColumn(*column.column)) - column.column = filterWithBlanks(column.column, nullable_column->getNullMapColumn().getData(), true); + column.column = JoinCommon::filterWithBlanks(column.column, nullable_column->getNullMapColumn().getData(), true); JoinCommon::removeColumnNullability(column); } @@ -1607,7 +1577,7 @@ void HashJoin::joinBlockImpl( const auto & col = block.getByName(left_name); bool is_nullable = JoinCommon::isNullable(right_key.type); - ColumnPtr thin_column = filterWithBlanks(col.column, filter); + ColumnPtr thin_column = JoinCommon::filterWithBlanks(col.column, filter); ColumnWithTypeAndName right_col(thin_column, col.type, right_col_name); if (right_col.type->lowCardinality() != right_key.type->lowCardinality()) diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index 74b54ef537f..9caaad7faa7 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -323,6 +323,7 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se else if (auto storage_dict = std::dynamic_pointer_cast(storage); storage_dict && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { + table_join->setRightStorageName(storage_dict->getDictionaryName()); table_join->setStorageJoin(storage_dict); } else if (auto storage_kv = std::dynamic_pointer_cast(storage); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 5d402ca4127..53eeff49437 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -727,6 +727,16 @@ void TableJoin::setStorageJoin(std::shared_ptr storage) right_storage_join = storage; } +void TableJoin::setRightStorageName(const std::string & storage_name) +{ + right_storage_name = storage_name; +} + +const std::string & TableJoin::getRightStorageName() const +{ + return right_storage_name; +} + void TableJoin::setStorageJoin(std::shared_ptr storage) { if (right_storage_join) diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 3bb3b00416c..4e0706712a4 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -29,6 +29,7 @@ class ASTSelectQuery; struct DatabaseAndTableWithAlias; class Block; class DictionaryReader; +class DictionaryJoinAdapter; class StorageJoin; class StorageDictionary; class IKeyValueStorage; @@ -145,6 +146,8 @@ private: std::shared_ptr right_kv_storage; + std::string right_storage_name; + Names requiredJoinedNames() const; /// Create converting actions and change key column names if required @@ -301,6 +304,10 @@ public: std::unordered_map leftToRightKeyRemap() const; + /// Remember storage name in case of joining with dictionary or another special storage + void setRightStorageName(const std::string & storage_name); + const std::string & getRightStorageName() const; + void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index c81f4a193c3..ba71445df29 100644 --- 
a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -573,6 +573,36 @@ void splitAdditionalColumns(const Names & key_names, const Block & sample_block, } } +ColumnPtr filterWithBlanks(ColumnPtr src_column, const IColumn::Filter & filter, bool inverse_filter) +{ + ColumnPtr column = src_column->convertToFullColumnIfConst(); + MutableColumnPtr mut_column = column->cloneEmpty(); + mut_column->reserve(column->size()); + + if (inverse_filter) + { + for (size_t row = 0; row < filter.size(); ++row) + { + if (filter[row]) + mut_column->insertDefault(); + else + mut_column->insertFrom(*column, row); + } + } + else + { + for (size_t row = 0; row < filter.size(); ++row) + { + if (filter[row]) + mut_column->insertFrom(*column, row); + else + mut_column->insertDefault(); + } + } + + return mut_column; +} + } NotJoinedBlocks::NotJoinedBlocks(std::unique_ptr filler_, diff --git a/src/Interpreters/join_common.h b/src/Interpreters/join_common.h index 38b431db3e0..2e26ab782a1 100644 --- a/src/Interpreters/join_common.h +++ b/src/Interpreters/join_common.h @@ -106,6 +106,9 @@ void splitAdditionalColumns(const Names & key_names, const Block & sample_block, void changeLowCardinalityInplace(ColumnWithTypeAndName & column); +/// Insert default values for rows marked in filter +ColumnPtr filterWithBlanks(ColumnPtr src_column, const IColumn::Filter & filter, bool inverse_filter = false); + } /// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table. diff --git a/tests/queries/0_stateless/01115_join_with_dictionary.sql b/tests/queries/0_stateless/01115_join_with_dictionary.sql index 5fbfe283fea..2b38abdec0e 100644 --- a/tests/queries/0_stateless/01115_join_with_dictionary.sql +++ b/tests/queries/0_stateless/01115_join_with_dictionary.sql @@ -1,5 +1,7 @@ SET send_logs_level = 'fatal'; +DROP TABLE IF EXISTS t1; + DROP DICTIONARY IF EXISTS dict_flat; DROP DICTIONARY IF EXISTS dict_hashed; DROP DICTIONARY IF EXISTS dict_complex_cache; From 7073067d40236e8095b5876d5b86d9b561eb845e Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 7 Jul 2022 14:53:39 +0000 Subject: [PATCH 111/164] check attributes for join with dict --- src/Dictionaries/DictionaryStructure.cpp | 6 ++++++ src/Dictionaries/DictionaryStructure.h | 1 + src/Interpreters/DictionaryJoinAdapter.cpp | 6 ++++++ src/Interpreters/DirectJoin.cpp | 7 ++++++- src/Interpreters/ExpressionAnalyzer.cpp | 5 ++++- 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 5624f9595d7..3ba82164eb2 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -167,6 +167,12 @@ void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const } } +bool DictionaryStructure::hasAttribute(const std::string & attribute_name) const +{ + auto it = attribute_name_to_index.find(attribute_name); + return it != attribute_name_to_index.end(); +} + const DictionaryAttribute & DictionaryStructure::getAttribute(const std::string & attribute_name) const { auto it = attribute_name_to_index.find(attribute_name); diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index bb4c306affa..327606e97f7 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -127,6 +127,7 @@ struct DictionaryStructure final DataTypes getKeyTypes() const; void validateKeyTypes(const DataTypes & key_types) const; + bool 
hasAttribute(const std::string & attribute_name) const; const DictionaryAttribute & getAttribute(const std::string & attribute_name) const; const DictionaryAttribute & getAttribute(const std::string & attribute_name, const DataTypePtr & type) const; diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp index 9866a25901a..e3c7deb3963 100644 --- a/src/Interpreters/DictionaryJoinAdapter.cpp +++ b/src/Interpreters/DictionaryJoinAdapter.cpp @@ -12,6 +12,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + DictionaryJoinAdapter::DictionaryJoinAdapter( std::shared_ptr dictionary_, const Names & result_column_names) : IKeyValueStorage(StorageID::createEmpty()) @@ -78,6 +83,7 @@ Chunk DictionaryJoinAdapter::getByKeys(const ColumnsWithTypeAndName & keys, Padd /// Dictinonary may have non-standart default values specified default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); + /// Result block consists of key columns and then attributes Columns result_columns = dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_cols); for (const auto & key_col : key_columns) diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index ceacca2fdca..46ed45f7708 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -30,6 +30,11 @@ static MutableColumns convertBlockStructure( MutableColumns result_columns; for (const auto & out_sample_col : result_sample_block) { + /// Some coulumns from result_sample_block may not be in source_sample_block, + /// e.g. if they will be calculated later based on joined columns + if (!source_sample_block.has(out_sample_col.name)) + continue; + auto i = source_sample_block.getPositionByName(out_sample_col.name); if (columns[i] == nullptr) { @@ -111,7 +116,7 @@ void DirectKeyValueJoin::joinBlock(Block & block, std::shared_ptr &) NullMap null_map; Chunk joined_chunk = storage->getByKeys({key_col}, null_map); - /// Expected right block may differ from structure in storage, because of `join_use_nulls` or we just select not all columns. 
+ /// Expected right block may differ from structure in storage, because of `join_use_nulls` or we just select not all joined attributes Block original_right_block = originalRightBlock(right_sample_block, *table_join); Block sample_storage_block = storage->getInMemoryMetadataPtr()->getSampleBlock(); MutableColumns result_columns = convertBlockStructure(sample_storage_block, original_right_block, joined_chunk.mutateColumns(), null_map); diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index a6c91f7ace3..4cb90c2218a 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1163,7 +1163,10 @@ std::shared_ptr tryDictJoin(std::shared_ptr analy { if (col.name == right_key) continue; - attr_names.push_back(analyzed_join->getOriginalName(col.name)); + + const auto & original_name = analyzed_join->getOriginalName(col.name); + if (dictionary->getStructure().hasAttribute(original_name)) + attr_names.push_back(original_name); } auto dict_reader = std::make_shared(dictionary, attr_names); From d1aea199875ffd4a5b8a7225b1c0ce01460ce482 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 8 Jul 2022 13:11:27 +0000 Subject: [PATCH 112/164] Remove old join with dictionary --- src/Interpreters/DictionaryReader.cpp | 172 ------------------------ src/Interpreters/DictionaryReader.h | 50 ------- src/Interpreters/ExpressionAnalyzer.cpp | 1 - src/Interpreters/HashJoin.cpp | 123 ++--------------- src/Interpreters/HashJoin.h | 14 +- src/Interpreters/JoinedTables.cpp | 1 - src/Interpreters/TableJoin.cpp | 12 +- src/Interpreters/TableJoin.h | 10 +- 8 files changed, 13 insertions(+), 370 deletions(-) delete mode 100644 src/Interpreters/DictionaryReader.cpp delete mode 100644 src/Interpreters/DictionaryReader.h diff --git a/src/Interpreters/DictionaryReader.cpp b/src/Interpreters/DictionaryReader.cpp deleted file mode 100644 index 3c66b0019ed..00000000000 --- a/src/Interpreters/DictionaryReader.cpp +++ /dev/null @@ -1,172 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; - extern const int TYPE_MISMATCH; -} - - -DictionaryReader::FunctionWrapper::FunctionWrapper(FunctionOverloadResolverPtr resolver, const ColumnsWithTypeAndName & arguments, - Block & block, const ColumnNumbers & arg_positions_, const String & column_name, - TypeIndex expected_type) - : arg_positions(arg_positions_) - , result_pos(block.columns()) -{ - FunctionBasePtr prepared_function = resolver->build(arguments); - - ColumnWithTypeAndName result; - result.name = "get_" + column_name; - result.type = prepared_function->getResultType(); - if (result.type->getTypeId() != expected_type) - throw Exception("Type mismatch in dictionary reader for: " + column_name, ErrorCodes::TYPE_MISMATCH); - block.insert(result); - - ColumnsWithTypeAndName args; - args.reserve(arg_positions.size()); - for (auto pos : arg_positions) - args.emplace_back(block.getByPosition(pos)); - - function = prepared_function->prepare(block.getColumnsWithTypeAndName()); -} - -static constexpr const size_t key_size = 1; - -DictionaryReader::DictionaryReader(const String & dictionary_name, const Names & src_column_names, const NamesAndTypesList & result_columns, - ContextPtr context) - : result_header(makeResultBlock(result_columns)) - , key_position(key_size + result_header.columns()) -{ - if (src_column_names.size() != result_columns.size()) - throw 
Exception("Columns number mismatch in dictionary reader", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH); - - ColumnWithTypeAndName dict_name; - ColumnWithTypeAndName key; - ColumnWithTypeAndName column_name; - - { - dict_name.name = "dict"; - dict_name.type = std::make_shared(); - dict_name.column = dict_name.type->createColumnConst(1, dictionary_name); - - /// TODO: composite key (key_size > 1) - key.name = "key"; - key.type = std::make_shared(); - - column_name.name = "column"; - column_name.type = std::make_shared(); - } - - /// dictHas('dict_name', id) - ColumnsWithTypeAndName arguments_has; - arguments_has.push_back(dict_name); - arguments_has.push_back(key); - - /// dictGet('dict_name', 'attr_name', id) - ColumnsWithTypeAndName arguments_get; - arguments_get.push_back(dict_name); - arguments_get.push_back(column_name); - arguments_get.push_back(key); - - sample_block.insert(dict_name); - - for (const auto & columns_name : src_column_names) - { - ColumnWithTypeAndName name; - name.name = "col_" + columns_name; - name.type = std::make_shared(); - name.column = name.type->createColumnConst(1, columns_name); - - sample_block.insert(name); - } - - sample_block.insert(key); - - ColumnNumbers positions_has{0, key_position}; - function_has = std::make_unique(FunctionFactory::instance().get("dictHas", context), - arguments_has, sample_block, positions_has, "has", DataTypeUInt8().getTypeId()); - functions_get.reserve(result_header.columns()); - - for (size_t i = 0; i < result_header.columns(); ++i) - { - size_t column_name_pos = key_size + i; - auto & column = result_header.getByPosition(i); - arguments_get[1].column = DataTypeString().createColumnConst(1, src_column_names[i]); - ColumnNumbers positions_get{0, column_name_pos, key_position}; - functions_get.emplace_back( - FunctionWrapper(FunctionFactory::instance().get("dictGet", context), - arguments_get, sample_block, positions_get, column.name, column.type->getTypeId())); - } -} - -void DictionaryReader::readKeys(const IColumn & keys, Block & out_block, ColumnVector::Container & found, - std::vector & positions) const -{ - auto working_block = sample_block.getColumnsWithTypeAndName(); - size_t has_position = key_position + 1; - size_t size = keys.size(); - - /// set keys for dictHas() - ColumnWithTypeAndName & key_column = working_block[key_position]; - key_column.column = keys.cloneResized(size); /// just a copy we cannot avoid - - /// calculate and extract dictHas() - function_has->execute(working_block, size); - ColumnWithTypeAndName & has_column = working_block[has_position]; - auto mutable_has = IColumn::mutate(std::move(has_column.column)); - found.swap(typeid_cast &>(*mutable_has).getData()); - has_column.column = nullptr; - - /// set mapping from source keys to resulting rows in output block - positions.clear(); - positions.resize(size, 0); - size_t pos = 0; - for (size_t i = 0; i < size; ++i) - if (found[i]) - positions[i] = pos++; - - /// set keys for dictGet(): remove not found keys - key_column.column = key_column.column->filter(found, -1); - size_t rows = key_column.column->size(); - - /// calculate dictGet() - for (const auto & func : functions_get) - func.execute(working_block, rows); - - /// make result: copy header block with correct names and move data columns - out_block = result_header.cloneEmpty(); - size_t first_get_position = has_position + 1; - for (size_t i = 0; i < out_block.columns(); ++i) - { - auto & src_column = working_block[first_get_position + i]; - auto & dst_column = out_block.getByPosition(i); - 
dst_column.column = src_column.column; - src_column.column = nullptr; - } -} - -Block DictionaryReader::makeResultBlock(const NamesAndTypesList & names) -{ - Block block; - for (const auto & nm : names) - { - ColumnWithTypeAndName column{nullptr, nm.type, nm.name}; - if (column.type->isNullable()) - column.type = typeid_cast(*column.type).getNestedType(); - block.insert(std::move(column)); - } - return block; -} - -} diff --git a/src/Interpreters/DictionaryReader.h b/src/Interpreters/DictionaryReader.h deleted file mode 100644 index bfb21e2f050..00000000000 --- a/src/Interpreters/DictionaryReader.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -/// Read block of required columns from Dictionary by UInt64 key column. Rename columns if needed. -/// Current implementation uses dictHas() + N * dictGet() functions. -class DictionaryReader -{ -public: - struct FunctionWrapper - { - ExecutableFunctionPtr function; - ColumnNumbers arg_positions; - size_t result_pos = 0; - - FunctionWrapper(FunctionOverloadResolverPtr resolver, const ColumnsWithTypeAndName & arguments, Block & block, - const ColumnNumbers & arg_positions_, const String & column_name, TypeIndex expected_type); - - void execute(ColumnsWithTypeAndName & columns, size_t rows) const - { - ColumnsWithTypeAndName args; - args.reserve(arg_positions.size()); - for (auto pos : arg_positions) - args.emplace_back(columns[pos]); - - columns[result_pos].column = function->execute(args, columns[result_pos].type, rows, false); - } - }; - - DictionaryReader(const String & dictionary_name, const Names & src_column_names, const NamesAndTypesList & result_columns, - ContextPtr context); - void readKeys(const IColumn & keys, Block & out_block, ColumnVector::Container & found, std::vector & positions) const; - -private: - Block result_header; - Block sample_block; /// dictionary name, column names, key, dictHas() result, dictGet() results - size_t key_position; - std::unique_ptr function_has; - std::vector functions_get; - - static Block makeResultBlock(const NamesAndTypesList & names); -}; - -} diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 4cb90c2218a..b466fbbfcc1 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 55a18cb4193..a948769540a 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -19,9 +19,7 @@ #include #include #include -#include -#include #include #include @@ -268,16 +266,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s const auto & key_names_right = clause.key_names_right; ColumnRawPtrs key_columns = JoinCommon::extractKeysForJoin(right_table_keys, key_names_right); - if (table_join->getDictionaryReader()) - { - assert(disjuncts_num == 1); - data->type = Type::DICT; - - data->maps.resize(disjuncts_num); - std::get(data->maps[0]).create(Type::DICT); - chooseMethod(kind, key_columns, key_sizes.emplace_back()); /// init key_sizes - } - else if (strictness == JoinStrictness::Asof) + if (strictness == ASTTableJoin::Strictness::Asof) { assert(disjuncts_num == 1); @@ -399,36 +388,6 @@ static KeyGetter createKeyGetter(const ColumnRawPtrs & key_columns, const Sizes template using FindResultImpl = ColumnsHashing::columns_hashing_impl::FindResultImpl; -class 
KeyGetterForDict -{ -public: - using Mapped = RowRef; - using FindResult = FindResultImpl; - - KeyGetterForDict(const TableJoin & table_join, const ColumnRawPtrs & key_columns) - { - assert(table_join.getDictionaryReader()); - table_join.getDictionaryReader()->readKeys(*key_columns[0], read_result, found, positions); - - for (ColumnWithTypeAndName & column : read_result) - if (table_join.rightBecomeNullable(column.type)) - JoinCommon::convertColumnToNullable(column); - } - - FindResult findKey(const TableJoin &, size_t row, const Arena &) - { - result.block = &read_result; - result.row_num = positions[row]; - return FindResult(&result, found[row], 0); - } - -private: - Block read_result; - Mapped result; - ColumnVector::Container found; - std::vector positions; -}; - /// Dummy key getter, always find nothing, used for JOIN ON NULL template class KeyGetterEmpty @@ -499,20 +458,13 @@ struct KeyGetterForType void HashJoin::dataMapInit(MapsVariant & map) { - if (data->type == Type::DICT) - return; - if (kind == JoinKind::Cross) + + if (kind == ASTTableJoin::Kind::Cross) return; joinDispatchInit(kind, strictness, map); joinDispatch(kind, strictness, map, [&](auto, auto, auto & map_) { map_.create(data->type); }); } -bool HashJoin::overDictionary() const -{ - assert(data->type != Type::DICT || table_join->getDictionaryReader()); - return data->type == Type::DICT; -} - bool HashJoin::empty() const { return data->type == Type::EMPTY; @@ -520,7 +472,7 @@ bool HashJoin::empty() const bool HashJoin::alwaysReturnsEmptySet() const { - return isInnerOrRight(getKind()) && data->empty && !overDictionary(); + return isInnerOrRight(getKind()) && data->empty; } size_t HashJoin::getTotalRowCount() const @@ -532,7 +484,7 @@ size_t HashJoin::getTotalRowCount() const for (const auto & block : data->blocks) res += block.rows(); } - else if (data->type != Type::DICT) + else { for (const auto & map : data->maps) { @@ -540,6 +492,7 @@ size_t HashJoin::getTotalRowCount() const } } + return res; } @@ -552,7 +505,7 @@ size_t HashJoin::getTotalByteCount() const for (const auto & block : data->blocks) res += block.bytes(); } - else if (data->type != Type::DICT) + else { for (const auto & map : data->maps) { @@ -663,7 +616,6 @@ namespace { case HashJoin::Type::EMPTY: return 0; case HashJoin::Type::CROSS: return 0; /// Do nothing. We have already saved block, and it is enough. - case HashJoin::Type::DICT: return 0; /// No one should call it with Type::DICT. #define M(TYPE) \ case HashJoin::Type::TYPE: \ @@ -723,9 +675,6 @@ Block HashJoin::structureRightBlock(const Block & block) const bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits) { - if (overDictionary()) - throw Exception("Logical error: insert into hash-map in HashJoin over dictionary", ErrorCodes::LOGICAL_ERROR); - /// RowRef::SizeT is uint32_t (not size_t) for hash table Cell memory efficiency. /// It's possible to split bigger blocks and insert them by parts here. But it would be a dead code. 
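The code removed in this hunk (KeyGetterForDict and the DICT map type) emulated a hash-table lookup by reading keys from the dictionary per block (dictHas() plus dictGet(), as the removed DictionaryReader comment notes); in the new scheme the equivalent "found mask plus defaults" behaviour lives in the direct-join path via JoinCommon::filterWithBlanks, added earlier in this series. The following is a small self-contained sketch of that mask-driven filtering on plain integers, purely illustrative: the real helper operates on IColumn.

#include <cstdint>
#include <iostream>
#include <vector>

/// Filter semantics used by the join code: the output has as many rows as the input;
/// rows whose flag is unset are replaced by a default value (0 here) instead of being
/// dropped, so row numbers stay aligned with the left-hand block. The inverse mode
/// swaps the two cases, which is what the anti join needs.
std::vector<int> filter_with_blanks(const std::vector<int> & src,
                                    const std::vector<uint8_t> & found,
                                    bool inverse = false)
{
    std::vector<int> result(src.size(), 0);
    for (size_t row = 0; row < src.size(); ++row)
    {
        bool keep = inverse ? !found[row] : static_cast<bool>(found[row]);
        if (keep)
            result[row] = src[row];   /// matched row keeps its value, like insertFrom()
        /// otherwise it stays default, like insertDefault() in the original helper
    }
    return result;
}

int main()
{
    std::vector<int> values{10, 20, 30};
    std::vector<uint8_t> found{1, 0, 1};

    for (int v : filter_with_blanks(values, found))
        std::cout << v << ' ';        /// prints: 10 0 30
    std::cout << '\n';
    return 0;
}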
if (unlikely(source_block.rows() > std::numeric_limits::max())) @@ -1446,28 +1395,6 @@ IColumn::Filter switchJoinRightColumns( } } -template -IColumn::Filter dictionaryJoinRightColumns(const TableJoin & table_join, AddedColumns & added_columns) -{ - if constexpr (KIND == JoinKind::Left && - (STRICTNESS == JoinStrictness::Any || - STRICTNESS == JoinStrictness::Semi || - STRICTNESS == JoinStrictness::Anti)) - { - assert(added_columns.join_on_keys.size() == 1); - - std::vector maps_vector; - maps_vector.push_back(&table_join); - - JoinStuff::JoinUsedFlags flags; - std::vector key_getter_vector; - key_getter_vector.push_back(KeyGetterForDict(table_join, added_columns.join_on_keys[0].key_columns)); - return joinRightColumnsSwitchNullability(std::move(key_getter_vector), maps_vector, added_columns, flags); - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong JOIN combination: {} {}", STRICTNESS, KIND); -} - } /// nameless template @@ -1514,9 +1441,7 @@ void HashJoin::joinBlockImpl( bool has_required_right_keys = (required_right_keys.columns() != 0); added_columns.need_filter = jf.need_filter || has_required_right_keys; - IColumn::Filter row_filter = overDictionary() ? - dictionaryJoinRightColumns(*table_join, added_columns) : - switchJoinRightColumns(maps_, added_columns, data->type, used_flags); + IColumn::Filter row_filter = switchJoinRightColumns(maps_, added_columns, data->type, used_flags); for (size_t i = 0; i < added_columns.size(); ++i) block.insert(added_columns.moveColumn(i)); @@ -1772,39 +1697,7 @@ void HashJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed) materializeBlockInplace(block); } - if (overDictionary()) { - auto & map = std::get(data->maps[0]); - std::vector*> maps_vector; - maps_vector.push_back(&map); - - if (kind == JoinKind::Left) - { - switch (strictness) - { - case JoinStrictness::Any: - case JoinStrictness::All: - joinBlockImpl(block, sample_block_with_columns_to_add, maps_vector); - break; - case JoinStrictness::Semi: - joinBlockImpl(block, sample_block_with_columns_to_add, maps_vector); - break; - case JoinStrictness::Anti: - joinBlockImpl(block, sample_block_with_columns_to_add, maps_vector); - break; - default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong JOIN combination: dictionary + {} {}", strictness, kind); - } - } - else if (kind == JoinKind::Inner && strictness == JoinStrictness::All) - joinBlockImpl(block, sample_block_with_columns_to_add, maps_vector); - - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong JOIN combination: {} {}", strictness, kind); - } - else - { - std::vectormaps[0])> * > maps_vector; for (size_t i = 0; i < table_join->getClauses().size(); ++i) maps_vector.push_back(&data->maps[i]); diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 54c641627c0..f12997fd1c9 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -33,7 +33,6 @@ namespace DB { class TableJoin; -class DictionaryReader; namespace JoinStuff { @@ -171,13 +170,12 @@ public: /// Used by joinGet function that turns StorageJoin into a dictionary. ColumnWithTypeAndName joinGet(const Block & block, const Block & block_with_columns_to_add) const; - bool isFilled() const override { return from_storage_join || data->type == Type::DICT; } + bool isFilled() const override { return from_storage_join; } JoinPipelineType pipelineType() const override { - /// No need to process anything in the right stream if it's a dictionary will just join the left stream with it. 
- bool is_filled = from_storage_join || data->type == Type::DICT; - if (is_filled) + /// No need to process anything in the right stream if hash table was already filled + if (from_storage_join) return JoinPipelineType::FilledRight; /// Default pipeline processes right stream at first and then left. @@ -233,7 +231,6 @@ public: { EMPTY, CROSS, - DICT, #define M(NAME) NAME, APPLY_FOR_JOIN_VARIANTS(M) #undef M @@ -261,7 +258,6 @@ public: { case Type::EMPTY: break; case Type::CROSS: break; - case Type::DICT: break; #define M(NAME) \ case Type::NAME: NAME = std::make_unique(); break; @@ -276,7 +272,6 @@ public: { case Type::EMPTY: return 0; case Type::CROSS: return 0; - case Type::DICT: return 0; #define M(NAME) \ case Type::NAME: return NAME ? NAME->size() : 0; @@ -293,7 +288,6 @@ public: { case Type::EMPTY: return 0; case Type::CROSS: return 0; - case Type::DICT: return 0; #define M(NAME) \ case Type::NAME: return NAME ? NAME->getBufferSizeInBytes() : 0; @@ -310,7 +304,6 @@ public: { case Type::EMPTY: return 0; case Type::CROSS: return 0; - case Type::DICT: return 0; #define M(NAME) \ case Type::NAME: return NAME ? NAME->getBufferSizeInCells() : 0; @@ -424,7 +417,6 @@ private: static Type chooseMethod(JoinKind kind, const ColumnRawPtrs & key_columns, Sizes & key_sizes); bool empty() const; - bool overDictionary() const; }; } diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index 9caaad7faa7..7a6f624ae47 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -324,7 +324,6 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se storage_dict && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { table_join->setRightStorageName(storage_dict->getDictionaryName()); - table_join->setStorageJoin(storage_dict); } else if (auto storage_kv = std::dynamic_pointer_cast(storage); storage_kv && join_algorithm.isSet(JoinAlgorithm::DIRECT)) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 53eeff49437..fccd8430f1c 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -13,7 +13,6 @@ #include -#include #include #include @@ -722,8 +721,6 @@ void TableJoin::setStorageJoin(std::shared_ptr storage) void TableJoin::setStorageJoin(std::shared_ptr storage) { - if (right_storage_dictionary) - throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "StorageJoin and Dictionary join are mutually exclusive"); right_storage_join = storage; } @@ -737,13 +734,6 @@ const std::string & TableJoin::getRightStorageName() const return right_storage_name; } -void TableJoin::setStorageJoin(std::shared_ptr storage) -{ - if (right_storage_join) - throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "StorageJoin and Dictionary join are mutually exclusive"); - right_storage_dictionary = storage; -} - String TableJoin::renamedRightColumnName(const String & name) const { if (const auto it = renames.find(name); it != renames.end()) @@ -828,7 +818,7 @@ void TableJoin::resetToCross() bool TableJoin::allowParallelHashJoin() const { - if (dictionary_reader || !join_algorithm.isSet(JoinAlgorithm::PARALLEL_HASH)) + if (!right_storage_name.empty() || !join_algorithm.isSet(JoinAlgorithm::PARALLEL_HASH)) return false; if (table_join.kind != JoinKind::Left && table_join.kind != JoinKind::Inner) return false; diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 4e0706712a4..d7463f1f4b5 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -28,7 +28,6 @@ class Context; class 
ASTSelectQuery; struct DatabaseAndTableWithAlias; class Block; -class DictionaryReader; class DictionaryJoinAdapter; class StorageJoin; class StorageDictionary; @@ -141,9 +140,6 @@ private: std::shared_ptr right_storage_join; - std::shared_ptr right_storage_dictionary; - std::shared_ptr dictionary_reader; - std::shared_ptr right_kv_storage; std::string right_storage_name; @@ -310,14 +306,10 @@ public: void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); - void setStorageJoin(std::shared_ptr storage); std::shared_ptr getStorageJoin() { return right_storage_join; } - bool tryInitDictJoin(const Block & sample_block, ContextPtr context); - - bool isSpecialStorage() const { return right_storage_dictionary || right_storage_join || right_kv_storage; } - const DictionaryReader * getDictionaryReader() const { return dictionary_reader.get(); } + bool isSpecialStorage() const { return !right_storage_name.empty() || right_storage_join || right_kv_storage; } std::shared_ptr getStorageKeyValue() { return right_kv_storage; } }; From 442ffb0349cafdcfa1f6d43a1ac501ada173d3df Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 Aug 2022 15:15:49 +0000 Subject: [PATCH 113/164] Fix build after resolving conflicts --- src/Interpreters/DirectJoin.cpp | 22 ++++---- src/Interpreters/ExpressionAnalyzer.cpp | 27 +++++----- src/Interpreters/HashJoin.cpp | 4 +- src/Interpreters/TableJoin.cpp | 69 ------------------------- 4 files changed, 27 insertions(+), 95 deletions(-) diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index 46ed45f7708..d47a4f7a305 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -69,21 +69,21 @@ DirectKeyValueJoin::DirectKeyValueJoin(std::shared_ptr table_join_, , right_sample_block(right_sample_block_) , log(&Poco::Logger::get("DirectKeyValueJoin")) { - if (!table_join->oneDisjunct() - || table_join->getOnlyClause().key_names_left.size() != 1 - || table_join->getOnlyClause().key_names_right.size() != 1) + if (!table_join->oneDisjunct() || + table_join->getOnlyClause().key_names_left.size() != 1 || + table_join->getOnlyClause().key_names_right.size() != 1) { throw DB::Exception(ErrorCodes::UNSUPPORTED_JOIN_KEYS, "Not supported by direct JOIN"); } - bool allowed_inner = isInner(table_join->kind()) && (table_join->strictness() == ASTTableJoin::Strictness::All || - table_join->strictness() == ASTTableJoin::Strictness::Any || + bool allowed_inner = isInner(table_join->kind()) && (table_join->strictness() == JoinStrictness::All || + table_join->strictness() == JoinStrictness::Any || table_join->strictness() != JoinStrictness::RightAny); - bool allowed_left = isLeft(table_join->kind()) && (table_join->strictness() == ASTTableJoin::Strictness::Any || - table_join->strictness() == ASTTableJoin::Strictness::All || - table_join->strictness() == ASTTableJoin::Strictness::Semi || - table_join->strictness() == ASTTableJoin::Strictness::Anti); + bool allowed_left = isLeft(table_join->kind()) && (table_join->strictness() == JoinStrictness::Any || + table_join->strictness() == JoinStrictness::All || + table_join->strictness() == JoinStrictness::Semi || + table_join->strictness() == JoinStrictness::Anti); if (!allowed_inner && !allowed_left) { throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "Strictness {} and kind {} is not supported by direct JOIN", @@ -128,8 +128,8 @@ void DirectKeyValueJoin::joinBlock(Block & block, std::shared_ptr &) block.insert(std::move(col)); } - bool is_semi_join = 
table_join->strictness() == ASTTableJoin::Strictness::Semi; - bool is_anti_join = table_join->strictness() == ASTTableJoin::Strictness::Anti; + bool is_semi_join = table_join->strictness() == JoinStrictness::Semi; + bool is_anti_join = table_join->strictness() == JoinStrictness::Anti; if (is_anti_join) { diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index b466fbbfcc1..18fcef87d3c 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -113,6 +113,8 @@ bool allowEarlyConstantFolding(const ActionsDAG & actions, const Settings & sett return true; } +Poco::Logger * getLogger() { return &Poco::Logger::get("ExpressionAnalyzer"); } + } bool sanitizeBlock(Block & block, bool throw_if_cannot_create_column) @@ -1006,6 +1008,7 @@ static ActionsDAGPtr createJoinedBlockActions(ContextPtr context, const TableJoi return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); } +std::shared_ptr tryDictJoin(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context); std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block); static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr analyzed_join, std::unique_ptr & joined_plan, ContextPtr context) @@ -1016,7 +1019,7 @@ static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr ana { JoinPtr direct_join = nullptr; direct_join = direct_join ? direct_join : tryKeyValueJoin(analyzed_join, right_sample_block); - direct_join = direct_join ? direct_join : tryDictJoin(analyzed_join, right_sample_block, getContext()); + direct_join = direct_join ? direct_join : tryDictJoin(analyzed_join, right_sample_block, context); if (direct_join) { @@ -1111,23 +1114,21 @@ static std::unique_ptr buildJoinedPlan( std::shared_ptr tryDictJoin(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context) { - using Strictness = ASTTableJoin::Strictness; - - bool allowed_inner = isInner(analyzed_join->kind()) && analyzed_join->strictness() == Strictness::All; - bool allowed_left = isLeft(analyzed_join->kind()) && (analyzed_join->strictness() == Strictness::Any || - analyzed_join->strictness() == Strictness::All || - analyzed_join->strictness() == Strictness::Semi || - analyzed_join->strictness() == Strictness::Anti); + bool allowed_inner = isInner(analyzed_join->kind()) && analyzed_join->strictness() == JoinStrictness::All; + bool allowed_left = isLeft(analyzed_join->kind()) && (analyzed_join->strictness() == JoinStrictness::Any || + analyzed_join->strictness() == JoinStrictness::All || + analyzed_join->strictness() == JoinStrictness::Semi || + analyzed_join->strictness() == JoinStrictness::Anti); if (!allowed_inner && !allowed_left) { - LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: {} {} is not supported", + LOG_TRACE(getLogger(), "Can't use dictionary join: {} {} is not supported", analyzed_join->kind(), analyzed_join->strictness()); return nullptr; } if (analyzed_join->getClauses().size() != 1 || analyzed_join->getClauses()[0].key_names_right.size() != 1) { - LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: only one key is supported"); + LOG_TRACE(getLogger(), "Can't use dictionary join: only one key is supported"); return nullptr; } @@ -1136,7 +1137,7 @@ std::shared_ptr tryDictJoin(std::shared_ptr analy const auto & dictionary_name = analyzed_join->getRightStorageName(); if (dictionary_name.empty()) { - 
LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: dictionary was not found"); + LOG_TRACE(getLogger(), "Can't use dictionary join: dictionary was not found"); return nullptr; } @@ -1145,14 +1146,14 @@ std::shared_ptr tryDictJoin(std::shared_ptr analy auto dictionary = dictionary_helper.getDictionary(dictionary_name); if (!dictionary) { - LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: dictionary was not found"); + LOG_TRACE(getLogger(), "Can't use dictionary join: dictionary was not found"); return nullptr; } const auto & dict_keys = dictionary->getStructure().getKeysNames(); if (dict_keys.size() != 1 || dict_keys[0] != analyzed_join->getOriginalName(right_key)) { - LOG_TRACE(&Poco::Logger::get("tryDictJoin"), "Can't use dictionary join: join key '{}' doesn't natch to dictionary key ({})", + LOG_TRACE(getLogger(), "Can't use dictionary join: join key '{}' doesn't natch to dictionary key ({})", right_key, fmt::join(dict_keys, ", ")); return nullptr; } diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index a948769540a..1b9c6e72c07 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -266,7 +266,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s const auto & key_names_right = clause.key_names_right; ColumnRawPtrs key_columns = JoinCommon::extractKeysForJoin(right_table_keys, key_names_right); - if (strictness == ASTTableJoin::Strictness::Asof) + if (strictness == JoinStrictness::Asof) { assert(disjuncts_num == 1); @@ -459,7 +459,7 @@ struct KeyGetterForType void HashJoin::dataMapInit(MapsVariant & map) { - if (kind == ASTTableJoin::Kind::Cross) + if (kind == JoinKind::Cross) return; joinDispatchInit(kind, strictness, map); joinDispatch(kind, strictness, map, [&](auto, auto, auto & map_) { map_.create(data->type); }); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index fccd8430f1c..ce42af644b1 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -416,75 +416,6 @@ bool TableJoin::needStreamWithNonJoinedRows() const return isRightOrFull(kind()); } -static std::optional getDictKeyName(const String & dict_name , ContextPtr context) -{ - auto dictionary = context->getExternalDictionariesLoader().getDictionary(dict_name, context); - if (!dictionary) - return {}; - - if (const auto & structure = dictionary->getStructure(); structure.id) - return structure.id->name; - return {}; -} - -bool TableJoin::tryInitDictJoin(const Block & sample_block, ContextPtr context) -{ - bool allowed_inner = isInner(kind()) && strictness() == JoinStrictness::All; - bool allowed_left = isLeft(kind()) && (strictness() == JoinStrictness::Any || - strictness() == JoinStrictness::All || - strictness() == JoinStrictness::Semi || - strictness() == JoinStrictness::Anti); - - /// Support ALL INNER, [ANY | ALL | SEMI | ANTI] LEFT - if (!allowed_inner && !allowed_left) - return false; - - if (clauses.size() != 1 || clauses[0].key_names_right.size() != 1) - return false; - - const auto & right_key = getOnlyClause().key_names_right[0]; - - /// TODO: support 'JOIN ... 
ON expr(dict_key) = table_key' - auto it_key = original_names.find(right_key); - if (it_key == original_names.end()) - return false; - - if (!right_storage_dictionary) - return false; - - auto dict_name = right_storage_dictionary->getDictionaryName(); - - auto dict_key = getDictKeyName(dict_name, context); - if (!dict_key.has_value() || *dict_key != it_key->second) - return false; /// JOIN key != Dictionary key - - Names src_names; - NamesAndTypesList dst_columns; - for (const auto & col : sample_block) - { - if (col.name == right_key) - continue; /// do not extract key column - - auto it = original_names.find(col.name); - if (it != original_names.end()) - { - String original = it->second; - src_names.push_back(original); - dst_columns.push_back({col.name, col.type}); - } - else - { - /// Can't extract column from dictionary table - /// TODO: Sometimes it should be possible to recunstruct required column, - /// e.g. if it's an expression depending on dictionary attributes - return false; - } - } - dictionary_reader = std::make_shared(dict_name, src_names, dst_columns, context); - - return true; -} - static void renameIfNeeded(String & name, const NameToNameMap & renames) { if (const auto it = renames.find(name); it != renames.end()) From 91fe8f0b46a2b9460023709e45a70e0fe3d11c32 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 Aug 2022 15:16:27 +0000 Subject: [PATCH 114/164] Force dict join in 01115_join_with_dictionary --- .../01115_join_with_dictionary.reference | 22 +++++++++--------- .../01115_join_with_dictionary.sql | 23 ++++++++++++++----- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/queries/0_stateless/01115_join_with_dictionary.reference b/tests/queries/0_stateless/01115_join_with_dictionary.reference index 326f0c5e14b..612c3333160 100644 --- a/tests/queries/0_stateless/01115_join_with_dictionary.reference +++ b/tests/queries/0_stateless/01115_join_with_dictionary.reference @@ -16,12 +16,6 @@ flat: any left 2 2 2 2 3 3 3 3 4 0 0 -flat: any left + any_join_distinct_right_table_keys -0 0 0 0 -1 1 1 1 -2 2 2 2 -3 3 3 3 -4 0 0 flat: semi left 0 0 0 0 1 1 1 1 @@ -37,11 +31,6 @@ flat: inner on 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 -flat: inner or -0 0 0 0 0 -1000 1 1 1 1 -2 2 2 2 2 -3000 3 3 3 3 hashed: left on 0 0 0 0 0 1 1 1 1 1 @@ -75,6 +64,17 @@ hashed: inner on 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 +flat: inner or +0 0 0 0 0 +1000 1 1 1 1 +2 2 2 2 2 +3000 3 3 3 3 +flat: any left + any_join_distinct_right_table_keys +0 0 0 0 +1 1 1 1 +2 2 2 2 +3 3 3 3 +4 0 0 complex_cache (smoke) 0 \N \N \N \N 1 \N \N \N \N diff --git a/tests/queries/0_stateless/01115_join_with_dictionary.sql b/tests/queries/0_stateless/01115_join_with_dictionary.sql index 2b38abdec0e..cde1385eaae 100644 --- a/tests/queries/0_stateless/01115_join_with_dictionary.sql +++ b/tests/queries/0_stateless/01115_join_with_dictionary.sql @@ -1,5 +1,3 @@ -SET send_logs_level = 'fatal'; - DROP TABLE IF EXISTS t1; DROP DICTIONARY IF EXISTS dict_flat; @@ -29,14 +27,15 @@ LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 1)); SET join_use_nulls = 0; +SET join_algorithm = 'direct'; + SELECT 'flat: left on'; SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 LEFT JOIN dict_flat d ON s1.key = d.key ORDER BY s1.key; SELECT 'flat: left'; SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 LEFT JOIN dict_flat d USING(key) ORDER BY key; SELECT 'flat: any left'; SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 ANY LEFT JOIN dict_flat d USING(key) ORDER BY key; -SELECT 'flat: any left + any_join_distinct_right_table_keys'; 
-- falls back to regular join -SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 ANY LEFT JOIN dict_flat d USING(key) ORDER BY key SETTINGS any_join_distinct_right_table_keys = '1'; + SELECT 'flat: semi left'; SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 SEMI JOIN dict_flat d USING(key) ORDER BY key; SELECT 'flat: anti left'; @@ -45,8 +44,6 @@ SELECT 'flat: inner'; SELECT * FROM (SELECT number AS key FROM numbers(2)) s1 JOIN dict_flat d USING(key); SELECT 'flat: inner on'; SELECT * FROM (SELECT number AS k FROM numbers(100)) s1 JOIN dict_flat d ON k = key ORDER BY k; -SELECT 'flat: inner or'; -- it's not a join over dictionary, because it doen't suppoert multiple keys, but of falls back to regular join -SELECT * FROM (SELECT if(number % 2 = 0, number, number * 1000) AS k FROM numbers(100)) s1 JOIN dict_flat d ON k = key OR k == 1000 * key ORDER BY key; SET join_use_nulls = 1; @@ -65,6 +62,20 @@ SELECT * FROM (SELECT number AS key FROM numbers(2)) s1 JOIN dict_hashed d USING SELECT 'hashed: inner on'; SELECT * FROM (SELECT number AS k FROM numbers(100)) s1 JOIN dict_hashed d ON k = key ORDER BY k; +SET join_use_nulls = 0; + +-- unsupported cases for dictionary join, falls back to regular join + +SET join_algorithm = 'default'; + +SELECT 'flat: inner or'; +SELECT * FROM (SELECT if(number % 2 = 0, number, number * 1000) AS k FROM numbers(100)) s1 JOIN dict_flat d ON k = key OR k == 1000 * key ORDER BY key; + +SELECT 'flat: any left + any_join_distinct_right_table_keys'; +SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 ANY LEFT JOIN dict_flat d USING(key) ORDER BY key SETTINGS any_join_distinct_right_table_keys = '1'; + +SET join_use_nulls = 1; + SELECT 'complex_cache (smoke)'; SELECT * FROM (SELECT number AS key FROM numbers(5)) s1 LEFT JOIN dict_complex_cache d ON s1.key = d.key ORDER BY s1.key; From f7b130b0cbe8f7915a54df849cd5b87f70a0c6e3 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 Aug 2022 15:18:09 +0000 Subject: [PATCH 115/164] Rename IKVStorage.h -> IKeyValueStorage.h --- src/Interpreters/DictionaryJoinAdapter.h | 2 +- src/Interpreters/DirectJoin.h | 2 +- src/Interpreters/HashJoin.h | 2 +- src/Interpreters/TableJoin.h | 2 +- src/Storages/{IKVStorage.h => IKeyValueStorage.h} | 0 src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename src/Storages/{IKVStorage.h => IKeyValueStorage.h} (100%) diff --git a/src/Interpreters/DictionaryJoinAdapter.h b/src/Interpreters/DictionaryJoinAdapter.h index adfa75a996b..13695a378fa 100644 --- a/src/Interpreters/DictionaryJoinAdapter.h +++ b/src/Interpreters/DictionaryJoinAdapter.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/DirectJoin.h b/src/Interpreters/DirectJoin.h index f7da06ef826..558cce124e9 100644 --- a/src/Interpreters/DirectJoin.h +++ b/src/Interpreters/DirectJoin.h @@ -9,7 +9,7 @@ #include -#include +#include #include namespace DB diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index f12997fd1c9..df448e015c1 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -27,7 +27,7 @@ #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index d7463f1f4b5..a05faae1609 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/IKVStorage.h 
b/src/Storages/IKeyValueStorage.h similarity index 100% rename from src/Storages/IKVStorage.h rename to src/Storages/IKeyValueStorage.h diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 62c9a0eeae7..038788f1710 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include From ad91c16ba0f4cb30b21f669b5fffa08de77ba3d9 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 Aug 2022 15:20:19 +0000 Subject: [PATCH 116/164] Rename join_common -> JoinUtils --- src/Interpreters/DictionaryJoinAdapter.cpp | 2 +- src/Interpreters/DirectJoin.cpp | 2 +- src/Interpreters/ExpressionAnalyzer.cpp | 2 +- src/Interpreters/ExpressionAnalyzer.h | 2 +- src/Interpreters/HashJoin.cpp | 2 +- src/Interpreters/JoinSwitcher.cpp | 2 +- src/Interpreters/{join_common.cpp => JoinUtils.cpp} | 2 +- src/Interpreters/{join_common.h => JoinUtils.h} | 0 src/Interpreters/MergeJoin.cpp | 2 +- src/Interpreters/TableJoin.h | 2 +- src/Processors/Transforms/JoiningTransform.cpp | 2 +- src/Storages/StorageJoin.cpp | 2 +- src/Storages/StorageJoin.h | 2 +- 13 files changed, 12 insertions(+), 12 deletions(-) rename src/Interpreters/{join_common.cpp => JoinUtils.cpp} (99%) rename src/Interpreters/{join_common.h => JoinUtils.h} (100%) diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp index e3c7deb3963..fdcb7f1b46e 100644 --- a/src/Interpreters/DictionaryJoinAdapter.cpp +++ b/src/Interpreters/DictionaryJoinAdapter.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index d47a4f7a305..cf72aa9279b 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -30,7 +30,7 @@ static MutableColumns convertBlockStructure( MutableColumns result_columns; for (const auto & out_sample_col : result_sample_block) { - /// Some coulumns from result_sample_block may not be in source_sample_block, + /// Some columns from result_sample_block may not be in source_sample_block, /// e.g. 
if they will be calculated later based on joined columns if (!source_sample_block.has(out_sample_col.name)) continue; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 18fcef87d3c..80f8fda252c 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -64,7 +64,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index da92bc10832..6eed349cda8 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 1b9c6e72c07..e559977be49 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/JoinSwitcher.cpp b/src/Interpreters/JoinSwitcher.cpp index 34c8bb4cfd5..5d5a9b27825 100644 --- a/src/Interpreters/JoinSwitcher.cpp +++ b/src/Interpreters/JoinSwitcher.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/JoinUtils.cpp similarity index 99% rename from src/Interpreters/join_common.cpp rename to src/Interpreters/JoinUtils.cpp index ba71445df29..59e2475a9b2 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/JoinUtils.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/src/Interpreters/join_common.h b/src/Interpreters/JoinUtils.h similarity index 100% rename from src/Interpreters/join_common.h rename to src/Interpreters/JoinUtils.h diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index 655d50355f9..f1dcff70c4c 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index a05faae1609..d473a42901a 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Processors/Transforms/JoiningTransform.cpp b/src/Processors/Transforms/JoiningTransform.cpp index 64343946ff4..fed28a11ad5 100644 --- a/src/Processors/Transforms/JoiningTransform.cpp +++ b/src/Processors/Transforms/JoiningTransform.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index c94fd3b2256..2e3e1d443ae 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index a75fe944a34..2a28ff6d01b 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB From 44c688332a5125c1f11d2a347e68c71fb7acf19c Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 4 Aug 2022 15:39:28 +0000 Subject: [PATCH 117/164] IKeyValueEntity is not inheritor of IStorage --- src/Interpreters/DictionaryJoinAdapter.cpp | 17 +++++++++-------- src/Interpreters/DictionaryJoinAdapter.h | 14 ++++++-------- src/Interpreters/DirectJoin.cpp | 4 ++-- 
src/Interpreters/DirectJoin.h | 6 +++--- src/Interpreters/HashJoin.h | 2 +- .../IKeyValueEntity.h} | 12 +++++++----- src/Interpreters/JoinedTables.cpp | 2 +- src/Interpreters/TableJoin.cpp | 2 +- src/Interpreters/TableJoin.h | 10 +++++----- src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp | 9 ++++++++- src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 6 ++++-- 11 files changed, 47 insertions(+), 37 deletions(-) rename src/{Storages/IKeyValueStorage.h => Interpreters/IKeyValueEntity.h} (80%) diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp index fdcb7f1b46e..d55c2f3bad5 100644 --- a/src/Interpreters/DictionaryJoinAdapter.cpp +++ b/src/Interpreters/DictionaryJoinAdapter.cpp @@ -19,7 +19,7 @@ namespace ErrorCodes DictionaryJoinAdapter::DictionaryJoinAdapter( std::shared_ptr dictionary_, const Names & result_column_names) - : IKeyValueStorage(StorageID::createEmpty()) + : IKeyValueEntity() , dictionary(dictionary_) { if (!dictionary) @@ -30,24 +30,20 @@ DictionaryJoinAdapter::DictionaryJoinAdapter( if (key_types.size() != key_names.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary '{}' has invalid structure", dictionary->getFullName()); - StorageInMemoryMetadata storage_metadata; - for (size_t i = 0; i < key_types.size(); ++i) { - storage_metadata.columns.add(ColumnDescription(key_names[i], key_types[i])); + sample_block.insert(ColumnWithTypeAndName(nullptr, key_types[i], key_names[i])); } for (const auto & attr_name : result_column_names) { const auto & attr = dictionary->getStructure().getAttribute(attr_name); - storage_metadata.columns.add(ColumnDescription(attr_name, attr.type)); + + sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr_name)); attribute_names.emplace_back(attr_name); result_types.emplace_back(attr.type); } - - /// Fill in memory metadata to make getSampleBlock work. 
- setInMemoryMetadata(storage_metadata); } Names DictionaryJoinAdapter::getPrimaryKey() const @@ -55,6 +51,11 @@ Names DictionaryJoinAdapter::getPrimaryKey() const return dictionary->getStructure().getKeysNames(); } +Block DictionaryJoinAdapter::getSampleBlock() const +{ + return sample_block; +} + Chunk DictionaryJoinAdapter::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const { if (keys.empty()) diff --git a/src/Interpreters/DictionaryJoinAdapter.h b/src/Interpreters/DictionaryJoinAdapter.h index 13695a378fa..888c1485831 100644 --- a/src/Interpreters/DictionaryJoinAdapter.h +++ b/src/Interpreters/DictionaryJoinAdapter.h @@ -3,30 +3,28 @@ #include #include #include -#include +#include namespace DB { /// Used in join with dictionary to provide sufficient interface to DirectJoin -class DictionaryJoinAdapter : public IKeyValueStorage +class DictionaryJoinAdapter : public IKeyValueEntity { public: - DictionaryJoinAdapter( - std::shared_ptr dictionary_, const Names & result_column_names); + DictionaryJoinAdapter(std::shared_ptr dictionary_, const Names & result_column_names); Names getPrimaryKey() const override; Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const override; - std::string getName() const override - { - return dictionary->getFullName(); - } + Block getSampleBlock() const override; private: std::shared_ptr dictionary; + Block sample_block; + Strings attribute_names; DataTypes result_types; }; diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index cf72aa9279b..2bae1afd7a5 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -63,7 +63,7 @@ static MutableColumns convertBlockStructure( DirectKeyValueJoin::DirectKeyValueJoin(std::shared_ptr table_join_, const Block & right_sample_block_, - std::shared_ptr storage_) + std::shared_ptr storage_) : table_join(table_join_) , storage(storage_) , right_sample_block(right_sample_block_) @@ -118,7 +118,7 @@ void DirectKeyValueJoin::joinBlock(Block & block, std::shared_ptr &) /// Expected right block may differ from structure in storage, because of `join_use_nulls` or we just select not all joined attributes Block original_right_block = originalRightBlock(right_sample_block, *table_join); - Block sample_storage_block = storage->getInMemoryMetadataPtr()->getSampleBlock(); + Block sample_storage_block = storage->getSampleBlock(); MutableColumns result_columns = convertBlockStructure(sample_storage_block, original_right_block, joined_chunk.mutateColumns(), null_map); for (size_t i = 0; i < result_columns.size(); ++i) diff --git a/src/Interpreters/DirectJoin.h b/src/Interpreters/DirectJoin.h index 558cce124e9..9aeca9b3a16 100644 --- a/src/Interpreters/DirectJoin.h +++ b/src/Interpreters/DirectJoin.h @@ -9,7 +9,7 @@ #include -#include +#include #include namespace DB @@ -23,7 +23,7 @@ public: DirectKeyValueJoin( std::shared_ptr table_join_, const Block & right_sample_block_, - std::shared_ptr storage_); + std::shared_ptr storage_); virtual const TableJoin & getTableJoin() const override { return *table_join; } @@ -50,7 +50,7 @@ public: private: std::shared_ptr table_join; - std::shared_ptr storage; + std::shared_ptr storage; Block right_sample_block; Block sample_block_with_columns_to_add; Poco::Logger * log; diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index df448e015c1..33955333aa2 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -27,7 +27,7 @@ #include #include 
-#include +#include namespace DB { diff --git a/src/Storages/IKeyValueStorage.h b/src/Interpreters/IKeyValueEntity.h similarity index 80% rename from src/Storages/IKeyValueStorage.h rename to src/Interpreters/IKeyValueEntity.h index 667ccda0c41..c1de1a2a9c5 100644 --- a/src/Storages/IKeyValueStorage.h +++ b/src/Interpreters/IKeyValueEntity.h @@ -1,18 +1,15 @@ #pragma once #include -#include #include namespace DB { -/// Storage that support key-value requests -class IKeyValueStorage : public IStorage +/// Interface for entities with key-value semantics. +class IKeyValueEntity { public: - using IStorage::IStorage; - /// Get primary key name that supports key-value requests. /// Primary key can constist of multiple columns. virtual Names getPrimaryKey() const = 0; @@ -28,6 +25,11 @@ public: * If the key was not found row would have a default value. */ virtual Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const = 0; + + /// Header for getByKeys result + virtual Block getSampleBlock() const = 0; + + virtual ~IKeyValueEntity() = default; }; } diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index 7a6f624ae47..c365239e6d8 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -325,7 +325,7 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se { table_join->setRightStorageName(storage_dict->getDictionaryName()); } - else if (auto storage_kv = std::dynamic_pointer_cast(storage); + else if (auto storage_kv = std::dynamic_pointer_cast(storage); storage_kv && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { table_join->setStorageJoin(storage_kv); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index ce42af644b1..67f66fb9694 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -645,7 +645,7 @@ ActionsDAGPtr TableJoin::applyKeyConvertToTable( return dag_stage1; } -void TableJoin::setStorageJoin(std::shared_ptr storage) +void TableJoin::setStorageJoin(std::shared_ptr storage) { right_kv_storage = storage; } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index d473a42901a..1866ad2a5fd 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include @@ -31,7 +31,7 @@ class Block; class DictionaryJoinAdapter; class StorageJoin; class StorageDictionary; -class IKeyValueStorage; +class IKeyValueEntity; struct ColumnWithTypeAndName; using ColumnsWithTypeAndName = std::vector; @@ -140,7 +140,7 @@ private: std::shared_ptr right_storage_join; - std::shared_ptr right_kv_storage; + std::shared_ptr right_kv_storage; std::string right_storage_name; @@ -304,14 +304,14 @@ public: void setRightStorageName(const std::string & storage_name); const std::string & getRightStorageName() const; - void setStorageJoin(std::shared_ptr storage); + void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); std::shared_ptr getStorageJoin() { return right_storage_join; } bool isSpecialStorage() const { return !right_storage_name.empty() || right_storage_join || right_kv_storage; } - std::shared_ptr getStorageKeyValue() { return right_kv_storage; } + std::shared_ptr getStorageKeyValue() { return right_kv_storage; } }; } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 2774c52fe7c..90cfbf910dd 100644 --- 
a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -349,7 +349,8 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, bool attach, ContextPtr context_, const String & primary_key_) - : IKeyValueStorage(table_id_) + : IStorage(table_id_) + , IKeyValueEntity() , WithContext(context_->getGlobalContext()) , primary_key{primary_key_} { @@ -571,6 +572,12 @@ Chunk StorageEmbeddedRocksDB::getByKeys( return getBySerializedKeys(raw_keys, &null_map); } +Block StorageEmbeddedRocksDB::getSampleBlock() const +{ + auto metadata = getInMemoryMetadataPtr(); + return metadata ? metadata->getSampleBlock() : Block(); +} + Chunk StorageEmbeddedRocksDB::getBySerializedKeys( const std::vector & keys, PaddedPODArray * null_map) const diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 038788f1710..da98dca8262 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include @@ -23,7 +23,7 @@ class Context; /// Operates with rocksdb data structures via rocksdb API (holds pointer to rocksdb::DB inside for that). /// Storage have one primary key. /// Values are serialized into raw strings to store in rocksdb. -class StorageEmbeddedRocksDB final : public IKeyValueStorage, WithContext +class StorageEmbeddedRocksDB final : public IStorage, public IKeyValueEntity, WithContext { friend class EmbeddedRocksDBSink; public: @@ -65,6 +65,8 @@ public: Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map) const override; + Block getSampleBlock() const override; + /// Return chunk with data for given serialized keys. /// If out_null_map is passed, fill it with 1/0 depending on key was/wasn't found. Result chunk may contain default values. /// If out_null_map is not passed. Not found rows excluded from result chunk. From 90fa2ed8c1b5d889f6a37b01cfa95ddf2c1637d3 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 8 Aug 2022 10:58:28 +0000 Subject: [PATCH 118/164] better code for join with dict --- src/Dictionaries/IDictionary.h | 11 +- src/Dictionaries/IDictionaryKeyValue.cpp | 116 ++++++++++++++++++ src/Interpreters/DictionaryJoinAdapter.cpp | 95 +------------- src/Interpreters/DictionaryJoinAdapter.h | 25 ---- src/Interpreters/DirectJoin.cpp | 10 +- src/Interpreters/DirectJoin.h | 4 +- src/Interpreters/ExpressionAnalyzer.cpp | 88 ++----------- src/Interpreters/IKeyValueEntity.h | 12 +- src/Interpreters/JoinedTables.cpp | 24 +++- src/Interpreters/TableJoin.cpp | 2 +- src/Interpreters/TableJoin.h | 6 +- .../RocksDB/StorageEmbeddedRocksDB.cpp | 5 +- src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 4 +- 13 files changed, 183 insertions(+), 219 deletions(-) create mode 100644 src/Dictionaries/IDictionaryKeyValue.cpp diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index 3f3c60206d6..61787aa38a5 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -5,16 +5,18 @@ #include #include +#include #include #include +#include #include #include #include #include - namespace DB { + namespace ErrorCodes { extern const int NOT_IMPLEMENTED; @@ -52,7 +54,7 @@ enum class DictionarySpecialKeyType /** * Base class for Dictionaries implementation. 
*/ -class IDictionary : public IExternalLoadable +class IDictionary : public IExternalLoadable, public IKeyValueEntity { public: explicit IDictionary(const StorageID & dictionary_id_) @@ -290,6 +292,11 @@ public: return dictionary_comment; } + /// IKeyValueEntity implementation + Names getPrimaryKey() const override; + Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & result_names) const override; + Block getSampleBlock(const Names & result_names) const override; + private: mutable std::mutex mutex; mutable StorageID dictionary_id TSA_GUARDED_BY(mutex); diff --git a/src/Dictionaries/IDictionaryKeyValue.cpp b/src/Dictionaries/IDictionaryKeyValue.cpp new file mode 100644 index 00000000000..d871ff55796 --- /dev/null +++ b/src/Dictionaries/IDictionaryKeyValue.cpp @@ -0,0 +1,116 @@ +#include +#include + + +namespace DB +{ + +static void splitNamesAndTypesFromStructure(const DictionaryStructure & structure, const Names & result_names, Names & attribute_names, DataTypes & result_types) +{ + if (!result_names.empty()) + { + for (const auto & attr_name : result_names) + { + if (!structure.hasAttribute(attr_name)) + continue; /// skip keys + const auto & attr = structure.getAttribute(attr_name); + attribute_names.emplace_back(attr.name); + result_types.emplace_back(attr.type); + } + } + else + { + /// If result_names is empty, then use all attributes from structure + for (const auto & attr : structure.attributes) + { + attribute_names.emplace_back(attr.name); + result_types.emplace_back(attr.type); + } + } +} + +Names IDictionary::getPrimaryKey() const +{ + return getStructure().getKeysNames(); +} + +Chunk IDictionary::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & result_names) const +{ + if (keys.empty()) + return Chunk(getSampleBlock(result_names).cloneEmpty().getColumns(), 0); + + const auto & dictionary_structure = getStructure(); + + /// Split column keys and types into separate vectors, to use in `IDictionary::getColumns` + Columns key_columns; + DataTypes key_types; + for (const auto & key : keys) + { + key_columns.emplace_back(key.column); + key_types.emplace_back(key.type); + } + + /// Fill null map + { + out_null_map.clear(); + + auto mask = hasKeys(key_columns, key_types); + const auto & mask_data = mask->getData(); + + out_null_map.resize(mask_data.size(), 0); + std::copy(mask_data.begin(), mask_data.end(), out_null_map.begin()); + } + + Names attribute_names; + DataTypes result_types; + splitNamesAndTypesFromStructure(dictionary_structure, result_names, attribute_names, result_types); + + Columns default_cols(result_types.size()); + for (size_t i = 0; i < result_types.size(); ++i) + /// Dictinonary may have non-standart default values specified + default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); + + Columns result_columns = getColumns(attribute_names, result_types, key_columns, key_types, default_cols); + + /// Result block should consist of key columns and then attributes + for (const auto & key_col : key_columns) + { + /// Insert default values for keys that were not found + ColumnPtr filtered_key_col = JoinCommon::filterWithBlanks(key_col, out_null_map); + result_columns.insert(result_columns.begin(), filtered_key_col); + } + + size_t num_rows = result_columns[0]->size(); + return Chunk(std::move(result_columns), num_rows); +} + +Block IDictionary::getSampleBlock(const Names & result_names) const +{ + const auto & dictionary_structure = 
getStructure(); + const auto & key_types = dictionary_structure.getKeyTypes(); + const auto & key_names = dictionary_structure.getKeysNames(); + + Block sample_block; + + for (size_t i = 0; i < key_types.size(); ++i) + sample_block.insert(ColumnWithTypeAndName(nullptr, key_types.at(i), key_names.at(i))); + + if (result_names.empty()) + { + for (const auto & attr : dictionary_structure.attributes) + sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr.name)); + } + else + { + for (const auto & attr_name : result_names) + { + if (!dictionary_structure.hasAttribute(attr_name)) + continue; /// skip keys + const auto & attr = dictionary_structure.getAttribute(attr_name); + sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr_name)); + } + } + return sample_block; +} + +} diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp index d55c2f3bad5..bf0ad373204 100644 --- a/src/Interpreters/DictionaryJoinAdapter.cpp +++ b/src/Interpreters/DictionaryJoinAdapter.cpp @@ -1,101 +1,8 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include + namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -DictionaryJoinAdapter::DictionaryJoinAdapter( - std::shared_ptr dictionary_, const Names & result_column_names) - : IKeyValueEntity() - , dictionary(dictionary_) -{ - if (!dictionary) - throw Exception("Dictionary is not initialized", ErrorCodes::LOGICAL_ERROR); - - const auto & key_types = dictionary->getStructure().getKeyTypes(); - const auto & key_names = dictionary->getStructure().getKeysNames(); - if (key_types.size() != key_names.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary '{}' has invalid structure", dictionary->getFullName()); - - for (size_t i = 0; i < key_types.size(); ++i) - { - sample_block.insert(ColumnWithTypeAndName(nullptr, key_types[i], key_names[i])); - } - - for (const auto & attr_name : result_column_names) - { - const auto & attr = dictionary->getStructure().getAttribute(attr_name); - - sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr_name)); - - attribute_names.emplace_back(attr_name); - result_types.emplace_back(attr.type); - } -} - -Names DictionaryJoinAdapter::getPrimaryKey() const -{ - return dictionary->getStructure().getKeysNames(); -} - -Block DictionaryJoinAdapter::getSampleBlock() const -{ - return sample_block; -} - -Chunk DictionaryJoinAdapter::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const -{ - if (keys.empty()) - return {}; - - Columns key_columns; - DataTypes key_types; - for (const auto & key : keys) - { - key_columns.emplace_back(key.column); - key_types.emplace_back(key.type); - } - - { - out_null_map.clear(); - - auto mask = dictionary->hasKeys(key_columns, key_types); - const auto & mask_data = mask->getData(); - - out_null_map.resize(mask_data.size(), 0); - std::copy(mask_data.begin(), mask_data.end(), out_null_map.begin()); - } - - Columns default_cols(result_types.size()); - for (size_t i = 0; i < result_types.size(); ++i) - /// Dictinonary may have non-standart default values specified - default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); - - /// Result block consists of key columns and then attributes - Columns result_columns = dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_cols); - - for (const auto & key_col : key_columns) - { - /// Insert default values for keys that were 
not found - ColumnPtr filtered_key_col = JoinCommon::filterWithBlanks(key_col, out_null_map); - result_columns.insert(result_columns.begin(), filtered_key_col); - } - - size_t num_rows = result_columns[0]->size(); - return Chunk(std::move(result_columns), num_rows); -} } diff --git a/src/Interpreters/DictionaryJoinAdapter.h b/src/Interpreters/DictionaryJoinAdapter.h index 888c1485831..dade5da94e6 100644 --- a/src/Interpreters/DictionaryJoinAdapter.h +++ b/src/Interpreters/DictionaryJoinAdapter.h @@ -1,32 +1,7 @@ #pragma once -#include -#include -#include -#include namespace DB { -/// Used in join with dictionary to provide sufficient interface to DirectJoin -class DictionaryJoinAdapter : public IKeyValueEntity -{ -public: - DictionaryJoinAdapter(std::shared_ptr dictionary_, const Names & result_column_names); - - Names getPrimaryKey() const override; - - Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const override; - - Block getSampleBlock() const override; - -private: - std::shared_ptr dictionary; - - Block sample_block; - - Strings attribute_names; - DataTypes result_types; -}; - } diff --git a/src/Interpreters/DirectJoin.cpp b/src/Interpreters/DirectJoin.cpp index 2bae1afd7a5..02b3854a47b 100644 --- a/src/Interpreters/DirectJoin.cpp +++ b/src/Interpreters/DirectJoin.cpp @@ -63,7 +63,7 @@ static MutableColumns convertBlockStructure( DirectKeyValueJoin::DirectKeyValueJoin(std::shared_ptr table_join_, const Block & right_sample_block_, - std::shared_ptr storage_) + std::shared_ptr storage_) : table_join(table_join_) , storage(storage_) , right_sample_block(right_sample_block_) @@ -113,12 +113,14 @@ void DirectKeyValueJoin::joinBlock(Block & block, std::shared_ptr &) if (!key_col.column) return; + Block original_right_block = originalRightBlock(right_sample_block, *table_join); + const Names & attribute_names = original_right_block.getNames(); + NullMap null_map; - Chunk joined_chunk = storage->getByKeys({key_col}, null_map); + Chunk joined_chunk = storage->getByKeys({key_col}, null_map, attribute_names); /// Expected right block may differ from structure in storage, because of `join_use_nulls` or we just select not all joined attributes - Block original_right_block = originalRightBlock(right_sample_block, *table_join); - Block sample_storage_block = storage->getSampleBlock(); + Block sample_storage_block = storage->getSampleBlock(attribute_names); MutableColumns result_columns = convertBlockStructure(sample_storage_block, original_right_block, joined_chunk.mutateColumns(), null_map); for (size_t i = 0; i < result_columns.size(); ++i) diff --git a/src/Interpreters/DirectJoin.h b/src/Interpreters/DirectJoin.h index 9aeca9b3a16..8e82b59da02 100644 --- a/src/Interpreters/DirectJoin.h +++ b/src/Interpreters/DirectJoin.h @@ -23,7 +23,7 @@ public: DirectKeyValueJoin( std::shared_ptr table_join_, const Block & right_sample_block_, - std::shared_ptr storage_); + std::shared_ptr storage_); virtual const TableJoin & getTableJoin() const override { return *table_join; } @@ -50,7 +50,7 @@ public: private: std::shared_ptr table_join; - std::shared_ptr storage; + std::shared_ptr storage; Block right_sample_block; Block sample_block_with_columns_to_add; Poco::Logger * log; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 80f8fda252c..dffe96d0a7b 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1008,7 +1008,6 @@ static ActionsDAGPtr 
createJoinedBlockActions(ContextPtr context, const TableJoi return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); } -std::shared_ptr tryDictJoin(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context); std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block); static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr analyzed_join, std::unique_ptr & joined_plan, ContextPtr context) @@ -1017,10 +1016,7 @@ static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr ana if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) { - JoinPtr direct_join = nullptr; - direct_join = direct_join ? direct_join : tryKeyValueJoin(analyzed_join, right_sample_block); - direct_join = direct_join ? direct_join : tryDictJoin(analyzed_join, right_sample_block, context); - + JoinPtr direct_join = tryKeyValueJoin(analyzed_join, right_sample_block); if (direct_join) { /// Do not need to execute plan for right part, it's ready. @@ -1112,67 +1108,6 @@ static std::unique_ptr buildJoinedPlan( return joined_plan; } -std::shared_ptr tryDictJoin(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context) -{ - bool allowed_inner = isInner(analyzed_join->kind()) && analyzed_join->strictness() == JoinStrictness::All; - bool allowed_left = isLeft(analyzed_join->kind()) && (analyzed_join->strictness() == JoinStrictness::Any || - analyzed_join->strictness() == JoinStrictness::All || - analyzed_join->strictness() == JoinStrictness::Semi || - analyzed_join->strictness() == JoinStrictness::Anti); - if (!allowed_inner && !allowed_left) - { - LOG_TRACE(getLogger(), "Can't use dictionary join: {} {} is not supported", - analyzed_join->kind(), analyzed_join->strictness()); - return nullptr; - } - - if (analyzed_join->getClauses().size() != 1 || analyzed_join->getClauses()[0].key_names_right.size() != 1) - { - LOG_TRACE(getLogger(), "Can't use dictionary join: only one key is supported"); - return nullptr; - } - - const auto & right_key = analyzed_join->getOnlyClause().key_names_right[0]; - - const auto & dictionary_name = analyzed_join->getRightStorageName(); - if (dictionary_name.empty()) - { - LOG_TRACE(getLogger(), "Can't use dictionary join: dictionary was not found"); - return nullptr; - } - - FunctionDictHelper dictionary_helper(context); - - auto dictionary = dictionary_helper.getDictionary(dictionary_name); - if (!dictionary) - { - LOG_TRACE(getLogger(), "Can't use dictionary join: dictionary was not found"); - return nullptr; - } - - const auto & dict_keys = dictionary->getStructure().getKeysNames(); - if (dict_keys.size() != 1 || dict_keys[0] != analyzed_join->getOriginalName(right_key)) - { - LOG_TRACE(getLogger(), "Can't use dictionary join: join key '{}' doesn't natch to dictionary key ({})", - right_key, fmt::join(dict_keys, ", ")); - return nullptr; - } - - Names attr_names; - for (const auto & col : right_sample_block) - { - if (col.name == right_key) - continue; - - const auto & original_name = analyzed_join->getOriginalName(col.name); - if (dictionary->getStructure().hasAttribute(original_name)) - attr_names.push_back(original_name); - } - - auto dict_reader = std::make_shared(dictionary, attr_names); - return std::make_shared(analyzed_join, right_sample_block, dict_reader); -} - std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block) { if (!analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) @@ -1180,19 +1115,17 @@ std::shared_ptr 
tryKeyValueJoin(std::shared_ptr a auto storage = analyzed_join->getStorageKeyValue(); if (!storage) - { return nullptr; - } - if (!isInnerOrLeft(analyzed_join->kind())) - { - return nullptr; - } - - if (analyzed_join->strictness() != JoinStrictness::All && - analyzed_join->strictness() != JoinStrictness::Any && - analyzed_join->strictness() != JoinStrictness::RightAny) + bool allowed_inner = isInner(analyzed_join->kind()) && analyzed_join->strictness() == JoinStrictness::All; + bool allowed_left = isLeft(analyzed_join->kind()) && (analyzed_join->strictness() == JoinStrictness::Any || + analyzed_join->strictness() == JoinStrictness::All || + analyzed_join->strictness() == JoinStrictness::Semi || + analyzed_join->strictness() == JoinStrictness::Anti); + if (!allowed_inner && !allowed_left) { + LOG_TRACE(getLogger(), "Can't use direct join: {} {} is not supported", + analyzed_join->kind(), analyzed_join->strictness()); return nullptr; } @@ -1205,6 +1138,7 @@ std::shared_ptr tryKeyValueJoin(std::shared_ptr a if (!only_one_key) { + LOG_TRACE(getLogger(), "Can't use direct join: only one key is supported"); return nullptr; } @@ -1213,6 +1147,8 @@ std::shared_ptr tryKeyValueJoin(std::shared_ptr a const auto & storage_primary_key = storage->getPrimaryKey(); if (storage_primary_key.size() != 1 || storage_primary_key[0] != original_key_name) { + LOG_TRACE(getLogger(), "Can't use direct join: join key '{}' doesn't match to storage key ({})", + original_key_name, fmt::join(storage_primary_key, ", ")); return nullptr; } diff --git a/src/Interpreters/IKeyValueEntity.h b/src/Interpreters/IKeyValueEntity.h index c1de1a2a9c5..d1ceda57f0e 100644 --- a/src/Interpreters/IKeyValueEntity.h +++ b/src/Interpreters/IKeyValueEntity.h @@ -10,6 +10,9 @@ namespace DB class IKeyValueEntity { public: + IKeyValueEntity() = default; + virtual ~IKeyValueEntity() = default; + /// Get primary key name that supports key-value requests. /// Primary key can constist of multiple columns. virtual Names getPrimaryKey() const = 0; @@ -19,17 +22,20 @@ public: * * @param keys - keys to get data for. Key can be compound and represented by several columns. * @param out_null_map - output parameter indicating which keys were not found. + * @param required_columns - if we don't need all attributes, implementation can use it to benefit from reading a subset of them. * * @return - chunk of data corresponding for keys. * Number of rows in chunk is equal to size of columns in keys. * If the key was not found row would have a default value. */ - virtual Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map) const = 0; + virtual Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & required_columns) const = 0; /// Header for getByKeys result - virtual Block getSampleBlock() const = 0; + virtual Block getSampleBlock(const Names & required_columns) const = 0; - virtual ~IKeyValueEntity() = default; +protected: + /// Names of result columns. If empty then all columns are required. 
+ Names key_value_result_names; }; } diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index c365239e6d8..f4a98ada199 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -320,13 +321,26 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se { table_join->setStorageJoin(storage_join); } - else if (auto storage_dict = std::dynamic_pointer_cast(storage); - storage_dict && join_algorithm.isSet(JoinAlgorithm::DIRECT)) + + if (auto storage_dict = std::dynamic_pointer_cast(storage); + storage_dict && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { - table_join->setRightStorageName(storage_dict->getDictionaryName()); + FunctionDictHelper dictionary_helper(context); + + auto dictionary_name = storage_dict->getDictionaryName(); + auto dictionary = dictionary_helper.getDictionary(dictionary_name); + if (!dictionary) + { + LOG_TRACE(&Poco::Logger::get("JoinedTables"), "Can't use dictionary join: dictionary '{}' was not found", dictionary_name); + return nullptr; + } + + auto dictionary_kv = std::dynamic_pointer_cast(dictionary); + table_join->setStorageJoin(dictionary_kv); } - else if (auto storage_kv = std::dynamic_pointer_cast(storage); - storage_kv && join_algorithm.isSet(JoinAlgorithm::DIRECT)) + + if (auto storage_kv = std::dynamic_pointer_cast(storage); + storage_kv && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { table_join->setStorageJoin(storage_kv); } diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 67f66fb9694..5d065e564b2 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -645,7 +645,7 @@ ActionsDAGPtr TableJoin::applyKeyConvertToTable( return dag_stage1; } -void TableJoin::setStorageJoin(std::shared_ptr storage) +void TableJoin::setStorageJoin(std::shared_ptr storage) { right_kv_storage = storage; } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 1866ad2a5fd..d0bf64fdebe 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -140,7 +140,7 @@ private: std::shared_ptr right_storage_join; - std::shared_ptr right_kv_storage; + std::shared_ptr right_kv_storage; std::string right_storage_name; @@ -304,14 +304,14 @@ public: void setRightStorageName(const std::string & storage_name); const std::string & getRightStorageName() const; - void setStorageJoin(std::shared_ptr storage); + void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); std::shared_ptr getStorageJoin() { return right_storage_join; } bool isSpecialStorage() const { return !right_storage_name.empty() || right_storage_join || right_kv_storage; } - std::shared_ptr getStorageKeyValue() { return right_kv_storage; } + std::shared_ptr getStorageKeyValue() { return right_kv_storage; } }; } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 90cfbf910dd..2d82986eb25 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -559,7 +559,8 @@ std::vector StorageEmbeddedRocksDB::multiGet(const std::vector< Chunk StorageEmbeddedRocksDB::getByKeys( const ColumnsWithTypeAndName & keys, - PaddedPODArray & null_map) const + PaddedPODArray & null_map, + const Names &) const { if (keys.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "StorageEmbeddedRocksDB supports only one key, got: {}", keys.size()); @@ 
-572,7 +573,7 @@ Chunk StorageEmbeddedRocksDB::getByKeys( return getBySerializedKeys(raw_keys, &null_map); } -Block StorageEmbeddedRocksDB::getSampleBlock() const +Block StorageEmbeddedRocksDB::getSampleBlock(const Names &) const { auto metadata = getInMemoryMetadataPtr(); return metadata ? metadata->getSampleBlock() : Block(); diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index da98dca8262..55770516b3f 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -63,9 +63,9 @@ public: std::vector multiGet(const std::vector & slices_keys, std::vector & values) const; Names getPrimaryKey() const override { return {primary_key}; } - Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map) const override; + Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & null_map, const Names &) const override; - Block getSampleBlock() const override; + Block getSampleBlock(const Names &) const override; /// Return chunk with data for given serialized keys. /// If out_null_map is passed, fill it with 1/0 depending on key was/wasn't found. Result chunk may contain default values. From e2929cdea5acdcdb16c962848076e6b4a67e2079 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 8 Aug 2022 11:40:09 +0000 Subject: [PATCH 119/164] Add semi and anti to 02242_join_rocksdb.sql --- .../queries/0_stateless/02242_join_rocksdb.reference | 12 ++++++++++++ tests/queries/0_stateless/02242_join_rocksdb.sql | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/tests/queries/0_stateless/02242_join_rocksdb.reference b/tests/queries/0_stateless/02242_join_rocksdb.reference index b1f7307ff4f..5f2ac6c11b7 100644 --- a/tests/queries/0_stateless/02242_join_rocksdb.reference +++ b/tests/queries/0_stateless/02242_join_rocksdb.reference @@ -14,6 +14,18 @@ 7 [7,8] val27 8 [8,9] val28 9 [9,10] val29 +-- left semi +0 +2 +3 +6 +7 +8 +9 +-- left anti +1 +4 +5 -- join_use_nulls left 0 0 Nullable(String) val20 1 \N Nullable(String) \N diff --git a/tests/queries/0_stateless/02242_join_rocksdb.sql b/tests/queries/0_stateless/02242_join_rocksdb.sql index 34b3d120eae..9d4cdb7af78 100644 --- a/tests/queries/0_stateless/02242_join_rocksdb.sql +++ b/tests/queries/0_stateless/02242_join_rocksdb.sql @@ -26,6 +26,12 @@ SELECT * FROM (SELECT k as key FROM t2) as t2 INNER JOIN rdb ON rdb.key == t2.ke SELECT '-- using'; SELECT * FROM (SELECT k as key FROM t2) as t2 INNER JOIN rdb USING key ORDER BY key; +SELECT '-- left semi'; +SELECT k FROM t2 LEFT SEMI JOIN rdb ON rdb.key == t2.k ORDER BY k; + +SELECT '-- left anti'; +SELECT k FROM t2 LEFT ANTI JOIN rdb ON rdb.key == t2.k ORDER BY k; + SELECT '-- join_use_nulls left'; SELECT k, key, toTypeName(value2), value2 FROM t2 LEFT JOIN rdb ON rdb.key == t2.k ORDER BY k SETTINGS join_use_nulls = 1; @@ -59,6 +65,9 @@ SELECT * FROM t1 INNER JOIN rdb ON rdb.key + 1 == t1.k FORMAT Null SETTINGS join SELECT * FROM t1 INNER JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } SELECT * FROM t1 INNER JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k FORMAT Null SETTINGS join_algorithm = 'direct,hash'; +SELECT * FROM t1 RIGHT SEMI JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } +SELECT * FROM t1 RIGHT ANTI JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } + DROP TABLE IF EXISTS rdb; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; From 
32c63f43a113a5c2b12c573faf91f3e59988cfd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 10 Aug 2022 17:54:56 +0200 Subject: [PATCH 120/164] Don't visit the AST for UDFs if none are registered --- src/Interpreters/TreeRewriter.cpp | 44 ++++++++++--------- .../UserDefinedSQLFunctionFactory.cpp | 5 +++ .../UserDefinedSQLFunctionFactory.h | 3 ++ 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 39f3453c7c4..9248e8eecb6 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -6,29 +6,30 @@ #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include +#include #include /// getSmallestColumn() -#include -#include -#include -#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -1405,8 +1406,11 @@ TreeRewriterResultPtr TreeRewriter::analyze( void TreeRewriter::normalize( ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings, bool allow_self_aliases, ContextPtr context_) { - UserDefinedSQLFunctionVisitor::Data data_user_defined_functions_visitor; - UserDefinedSQLFunctionVisitor(data_user_defined_functions_visitor).visit(query); + if (!UserDefinedSQLFunctionFactory::instance().empty()) + { + UserDefinedSQLFunctionVisitor::Data data_user_defined_functions_visitor; + UserDefinedSQLFunctionVisitor(data_user_defined_functions_visitor).visit(query); + } CustomizeCountDistinctVisitor::Data data_count_distinct{settings.count_distinct_implementation}; CustomizeCountDistinctVisitor(data_count_distinct).visit(query); diff --git a/src/Interpreters/UserDefinedSQLFunctionFactory.cpp b/src/Interpreters/UserDefinedSQLFunctionFactory.cpp index 698faac5fab..db11ee12b03 100644 --- a/src/Interpreters/UserDefinedSQLFunctionFactory.cpp +++ b/src/Interpreters/UserDefinedSQLFunctionFactory.cpp @@ -160,4 +160,9 @@ std::vector UserDefinedSQLFunctionFactory::getAllRegisteredNames() return registered_names; } +bool UserDefinedSQLFunctionFactory::empty() const +{ + std::lock_guard lock(mutex); + return function_name_to_create_query.size() == 0; +} } diff --git a/src/Interpreters/UserDefinedSQLFunctionFactory.h b/src/Interpreters/UserDefinedSQLFunctionFactory.h index 63bf5d73c15..db43bb7298e 100644 --- a/src/Interpreters/UserDefinedSQLFunctionFactory.h +++ b/src/Interpreters/UserDefinedSQLFunctionFactory.h @@ -43,6 +43,9 @@ public: /// Get all user defined functions registered names. 
std::vector getAllRegisteredNames() const override; + /// Check whether any UDFs have been registered + bool empty() const; + private: std::unordered_map function_name_to_create_query; mutable std::mutex mutex; From 5fea2091ac227044b828df7ce2608cd8e2b066ad Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 10 Aug 2022 14:45:22 +0000 Subject: [PATCH 121/164] Embed IKeyValue impl into IDictionary.h --- src/Dictionaries/IDictionary.h | 103 +++++++++++++++- src/Dictionaries/IDictionaryKeyValue.cpp | 116 ------------------ .../RocksDB/StorageEmbeddedRocksDB.cpp | 1 - 3 files changed, 100 insertions(+), 120 deletions(-) delete mode 100644 src/Dictionaries/IDictionaryKeyValue.cpp diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index 61787aa38a5..d82f89b7473 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -293,9 +294,105 @@ public: } /// IKeyValueEntity implementation - Names getPrimaryKey() const override; - Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & result_names) const override; - Block getSampleBlock(const Names & result_names) const override; + Names getPrimaryKey() const override { return getStructure().getKeysNames(); } + + Chunk getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & result_names) const override + { + if (keys.empty()) + return Chunk(getSampleBlock(result_names).cloneEmpty().getColumns(), 0); + + const auto & dictionary_structure = getStructure(); + + /// Split column keys and types into separate vectors, to use in `IDictionary::getColumns` + Columns key_columns; + DataTypes key_types; + for (const auto & key : keys) + { + key_columns.emplace_back(key.column); + key_types.emplace_back(key.type); + } + + /// Fill null map + { + out_null_map.clear(); + + auto mask = hasKeys(key_columns, key_types); + const auto & mask_data = mask->getData(); + + out_null_map.resize(mask_data.size(), 0); + std::copy(mask_data.begin(), mask_data.end(), out_null_map.begin()); + } + + Names attribute_names; + DataTypes result_types; + if (!result_names.empty()) + { + for (const auto & attr_name : result_names) + { + if (!dictionary_structure.hasAttribute(attr_name)) + continue; /// skip keys + const auto & attr = dictionary_structure.getAttribute(attr_name); + attribute_names.emplace_back(attr.name); + result_types.emplace_back(attr.type); + } + } + else + { + /// If result_names is empty, then use all attributes from dictionary_structure + for (const auto & attr : dictionary_structure.attributes) + { + attribute_names.emplace_back(attr.name); + result_types.emplace_back(attr.type); + } + } + + Columns default_cols(result_types.size()); + for (size_t i = 0; i < result_types.size(); ++i) + /// Dictionary may have non-standard default values specified + default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); + + Columns result_columns = getColumns(attribute_names, result_types, key_columns, key_types, default_cols); + + /// Result block should consist of key columns and then attributes + for (const auto & key_col : key_columns) + { + /// Insert default values for keys that were not found + ColumnPtr filtered_key_col = JoinCommon::filterWithBlanks(key_col, out_null_map); + result_columns.insert(result_columns.begin(), filtered_key_col); + } + + size_t num_rows = result_columns[0]->size(); + return Chunk(std::move(result_columns), 
num_rows); + } + + Block getSampleBlock(const Names & result_names) const override + { + const auto & dictionary_structure = getStructure(); + const auto & key_types = dictionary_structure.getKeyTypes(); + const auto & key_names = dictionary_structure.getKeysNames(); + + Block sample_block; + + for (size_t i = 0; i < key_types.size(); ++i) + sample_block.insert(ColumnWithTypeAndName(nullptr, key_types.at(i), key_names.at(i))); + + if (result_names.empty()) + { + for (const auto & attr : dictionary_structure.attributes) + sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr.name)); + } + else + { + for (const auto & attr_name : result_names) + { + if (!dictionary_structure.hasAttribute(attr_name)) + continue; /// skip keys + const auto & attr = dictionary_structure.getAttribute(attr_name); + sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr_name)); + } + } + return sample_block; + } private: mutable std::mutex mutex; diff --git a/src/Dictionaries/IDictionaryKeyValue.cpp b/src/Dictionaries/IDictionaryKeyValue.cpp deleted file mode 100644 index d871ff55796..00000000000 --- a/src/Dictionaries/IDictionaryKeyValue.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - - -namespace DB -{ - -static void splitNamesAndTypesFromStructure(const DictionaryStructure & structure, const Names & result_names, Names & attribute_names, DataTypes & result_types) -{ - if (!result_names.empty()) - { - for (const auto & attr_name : result_names) - { - if (!structure.hasAttribute(attr_name)) - continue; /// skip keys - const auto & attr = structure.getAttribute(attr_name); - attribute_names.emplace_back(attr.name); - result_types.emplace_back(attr.type); - } - } - else - { - /// If result_names is empty, then use all attributes from structure - for (const auto & attr : structure.attributes) - { - attribute_names.emplace_back(attr.name); - result_types.emplace_back(attr.type); - } - } -} - -Names IDictionary::getPrimaryKey() const -{ - return getStructure().getKeysNames(); -} - -Chunk IDictionary::getByKeys(const ColumnsWithTypeAndName & keys, PaddedPODArray & out_null_map, const Names & result_names) const -{ - if (keys.empty()) - return Chunk(getSampleBlock(result_names).cloneEmpty().getColumns(), 0); - - const auto & dictionary_structure = getStructure(); - - /// Split column keys and types into separate vectors, to use in `IDictionary::getColumns` - Columns key_columns; - DataTypes key_types; - for (const auto & key : keys) - { - key_columns.emplace_back(key.column); - key_types.emplace_back(key.type); - } - - /// Fill null map - { - out_null_map.clear(); - - auto mask = hasKeys(key_columns, key_types); - const auto & mask_data = mask->getData(); - - out_null_map.resize(mask_data.size(), 0); - std::copy(mask_data.begin(), mask_data.end(), out_null_map.begin()); - } - - Names attribute_names; - DataTypes result_types; - splitNamesAndTypesFromStructure(dictionary_structure, result_names, attribute_names, result_types); - - Columns default_cols(result_types.size()); - for (size_t i = 0; i < result_types.size(); ++i) - /// Dictinonary may have non-standart default values specified - default_cols[i] = result_types[i]->createColumnConstWithDefaultValue(out_null_map.size()); - - Columns result_columns = getColumns(attribute_names, result_types, key_columns, key_types, default_cols); - - /// Result block should consist of key columns and then attributes - for (const auto & key_col : key_columns) - { - /// Insert default values for keys that were not found - ColumnPtr filtered_key_col 
= JoinCommon::filterWithBlanks(key_col, out_null_map); - result_columns.insert(result_columns.begin(), filtered_key_col); - } - - size_t num_rows = result_columns[0]->size(); - return Chunk(std::move(result_columns), num_rows); -} - -Block IDictionary::getSampleBlock(const Names & result_names) const -{ - const auto & dictionary_structure = getStructure(); - const auto & key_types = dictionary_structure.getKeyTypes(); - const auto & key_names = dictionary_structure.getKeysNames(); - - Block sample_block; - - for (size_t i = 0; i < key_types.size(); ++i) - sample_block.insert(ColumnWithTypeAndName(nullptr, key_types.at(i), key_names.at(i))); - - if (result_names.empty()) - { - for (const auto & attr : dictionary_structure.attributes) - sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr.name)); - } - else - { - for (const auto & attr_name : result_names) - { - if (!dictionary_structure.hasAttribute(attr_name)) - continue; /// skip keys - const auto & attr = dictionary_structure.getAttribute(attr_name); - sample_block.insert(ColumnWithTypeAndName(nullptr, attr.type, attr_name)); - } - } - return sample_block; -} - -} diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 2d82986eb25..3c277abb693 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -350,7 +350,6 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, ContextPtr context_, const String & primary_key_) : IStorage(table_id_) - , IKeyValueEntity() , WithContext(context_->getGlobalContext()) , primary_key{primary_key_} { From 7f54fa726b8d6f2207bad8bdde4effed980e8eaa Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 14 Jun 2022 19:08:27 +0800 Subject: [PATCH 122/164] Decoupling cache functions and algorithms --- src/Common/FileSegment.h | 2 +- src/Common/IFileCachePriority.h | 92 ++++++++++++ src/Common/LRUFileCache.cpp | 169 ++++++++++++---------- src/Common/tests/gtest_lru_file_cache.cpp | 8 +- 4 files changed, 186 insertions(+), 85 deletions(-) create mode 100644 src/Common/IFileCachePriority.h diff --git a/src/Common/FileSegment.h b/src/Common/FileSegment.h index 93cbf269a8e..4404d0e14be 100644 --- a/src/Common/FileSegment.h +++ b/src/Common/FileSegment.h @@ -27,7 +27,7 @@ using FileSegments = std::list; class FileSegment : boost::noncopyable { -friend class LRUFileCache; +friend class FileCache; friend struct FileSegmentsHolder; friend class FileSegmentRangeWriter; diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h new file mode 100644 index 00000000000..35b82d61228 --- /dev/null +++ b/src/Common/IFileCachePriority.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class IFileCachePriority; +using FileCachePriorityPtr = std::shared_ptr; + +/// IFileCachePriority is used to maintain the priority of cached data. +class IFileCachePriority +{ +public: + using Key = UInt128; + + class IIterator; + friend class IIterator; + using Iterator = std::shared_ptr; + + struct FileCacheRecord + { + Key key; + size_t offset; + size_t size; + size_t hits = 0; + + FileCacheRecord(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) { } + }; + + /// It provides an iterator to traverse the cache priority. Under normal circumstances, + /// the iterator can only return the records that have been directly swapped out. 
+ /// For example, in the LRU algorithm, it can traverse all records, but in the LRU-K, it + /// can only traverse the records in the low priority queue. + class IIterator + { + public: + virtual ~IIterator() = default; + + virtual void next() = 0; + + virtual bool vaild() const = 0; + + /// Mark a cache record as recently used, it will update the priority + /// of the cache record according to different cache algorithms. + virtual void use(std::lock_guard & cache_lock) = 0; + + /// Deletes an existing cached record. + virtual void remove(std::lock_guard & cache_lock) = 0; + + virtual Key & key() const = 0; + + virtual size_t offset() const = 0; + + virtual size_t size() const = 0; + + virtual size_t hits() const = 0; + + virtual Iterator getSnapshot() = 0; + + virtual void incrementSize(size_t size_increment, std::lock_guard & cache_lock) = 0; + }; + +public: + virtual ~IFileCachePriority() = default; + + /// Add a cache record that did not exist before, and throw a + /// logical exception if the cache block already exists. + virtual Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) = 0; + + /// Query whether a cache record exists. If it exists, return true. If not, return false. + virtual bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) = 0; + + virtual void removeAll(std::lock_guard & cache_lock) = 0; + + /// Returns an iterator pointing to the lowest priority cached record. + /// We can traverse all cached records through the iterator's next(). + virtual Iterator getNewIterator(std::lock_guard & cache_lock) = 0; + + virtual size_t getElementsNum(std::lock_guard & cache_lock) const = 0; + + size_t getCacheSize(std::lock_guard &) const { return cache_size; } + + virtual std::string toString(std::lock_guard & cache_lock) const = 0; + +protected: + size_t max_cache_size = 0; + size_t cache_size = 0; +}; +}; diff --git a/src/Common/LRUFileCache.cpp b/src/Common/LRUFileCache.cpp index 6306b6de059..817208c6c30 100644 --- a/src/Common/LRUFileCache.cpp +++ b/src/Common/LRUFileCache.cpp @@ -24,6 +24,8 @@ namespace ErrorCodes LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_) : IFileCache(cache_base_path_, cache_settings_) + , main_priority(std::make_shared()) + , stash_priority(std::make_shared()) , max_stash_element_size(cache_settings_.max_elements) , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold) , log(&Poco::Logger::get("LRUFileCache")) @@ -31,7 +33,7 @@ LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSetti { } -void LRUFileCache::initialize() +void FileCache::initialize() { std::lock_guard cache_lock(mutex); if (!is_initialized) @@ -55,7 +57,7 @@ void LRUFileCache::initialize() } } -void LRUFileCache::useCell( +void FileCache::useCell( const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock) { auto file_segment = cell.file_segment; @@ -75,11 +77,11 @@ void LRUFileCache::useCell( if (cell.queue_iterator) { /// Move to the end of the queue. The iterator remains valid. 
- queue.moveToEnd(*cell.queue_iterator, cache_lock); + cell.queue_iterator->use(cache_lock); } } -LRUFileCache::FileSegmentCell * LRUFileCache::getCell( +FileCache::FileSegmentCell * FileCache::getCell( const Key & key, size_t offset, std::lock_guard & /* cache_lock */) { auto it = files.find(key); @@ -94,7 +96,7 @@ LRUFileCache::FileSegmentCell * LRUFileCache::getCell( return &cell_it->second; } -FileSegments LRUFileCache::getImpl( +FileSegments FileCache::getImpl( const Key & key, const FileSegment::Range & range, std::lock_guard & cache_lock) { /// Given range = [left, right] and non-overlapping ordered set of file segments, @@ -145,12 +147,8 @@ FileSegments LRUFileCache::getImpl( if (range.left <= prev_cell_range.right) { - /// segment{k-1} segment{k} /// [________] [_____ /// [___________ - /// ^ - /// range.left - useCell(prev_cell, result, cache_lock); } } @@ -204,7 +202,7 @@ FileSegments LRUFileCache::splitRangeIntoCells( return file_segments; } -void LRUFileCache::fillHolesWithEmptyFileSegments( +void FileCache::fillHolesWithEmptyFileSegments( FileSegments & file_segments, const Key & key, const FileSegment::Range & range, @@ -326,7 +324,7 @@ FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t return FileSegmentsHolder(std::move(file_segments)); } -FileSegmentsHolder LRUFileCache::get(const Key & key, size_t offset, size_t size) +FileSegmentsHolder FileCache::get(const Key & key, size_t offset, size_t size) { assertInitialized(); @@ -379,20 +377,19 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( FileSegment::State result_state = state; if (state == FileSegment::State::EMPTY && enable_cache_hits_threshold) { - auto record = records.find({key, offset}); + auto record = stash_records.find({key, offset}); - if (record == records.end()) + if (record == stash_records.end()) { - auto queue_iter = stash_queue.add(key, offset, 0, cache_lock); - records.insert({{key, offset}, queue_iter}); + auto priority_iter = stash_priority->add(key, offset, 0, cache_lock); + stash_records.insert({{key, offset}, priority_iter}); - if (stash_queue.getElementsNum(cache_lock) > max_stash_element_size) + if (stash_priority->getElementsNum(cache_lock) > max_stash_element_size) { - auto remove_queue_iter = stash_queue.begin(); - records.erase({remove_queue_iter->key, remove_queue_iter->offset}); - stash_queue.remove(remove_queue_iter, cache_lock); + auto remove_priority_iter = stash_priority->getNewIterator(cache_lock); + stash_records.erase({remove_priority_iter->key(), remove_priority_iter->offset()}); + remove_priority_iter->remove(cache_lock); } - /// For segments that do not reach the download threshold, we do not download them, but directly read them result_state = FileSegment::State::SKIP_CACHE; } @@ -452,7 +449,7 @@ FileSegmentsHolder LRUFileCache::setDownloading( return FileSegmentsHolder(std::move(file_segments)); } -bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) +bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) { auto query_context = enable_filesystem_query_cache_limit ? 
getCurrentQueryContext(cache_lock) : nullptr; if (!query_context) @@ -473,40 +470,40 @@ bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std:: else { size_t removed_size = 0; - size_t queue_size = queue.getElementsNum(cache_lock); + size_t queue_size = main_priority->getElementsNum(cache_lock); auto * cell_for_reserve = getCell(key, offset, cache_lock); - std::vector ghost; + std::vector ghost; std::vector trash; std::vector to_evict; auto is_overflow = [&] { - return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size) + return (max_size != 0 && main_priority->getCacheSize(cache_lock) + size - removed_size > max_size) || (max_element_size != 0 && queue_size > max_element_size) || (query_context->getCacheSize() + size - removed_size > query_context->getMaxCacheSize()); }; /// Select the cache from the LRU queue held by query for expulsion. - for (auto iter = query_context->queue().begin(); iter != query_context->queue().end(); iter++) + for (auto iter = query_context->getPriority()->getNewIterator(cache_lock); iter->vaild(); iter->next()) { if (!is_overflow()) break; - auto * cell = getCell(iter->key, iter->offset, cache_lock); + auto * cell = getCell(iter->key(), iter->offset(), cache_lock); if (!cell) { /// The cache corresponding to this record may be swapped out by /// other queries, so it has become invalid. - ghost.push_back(iter); - removed_size += iter->size; + ghost.push_back(iter->getSnapshot()); + removed_size += iter->size(); } else { size_t cell_size = cell->size(); - assert(iter->size == cell_size); + assert(iter->size() == cell_size); if (cell->releasable()) { @@ -548,7 +545,7 @@ bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std:: } for (auto & iter : ghost) - query_context->remove(iter->key, iter->offset, iter->size, cache_lock); + query_context->remove(iter->key(), iter->offset(), iter->size(), cache_lock); if (is_overflow()) return false; @@ -557,9 +554,9 @@ bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std:: { auto queue_iterator = cell_for_reserve->queue_iterator; if (queue_iterator) - queue.incrementSize(*queue_iterator, size, cache_lock); + queue_iterator->incrementSize(size, cache_lock); else - cell_for_reserve->queue_iterator = queue.add(key, offset, size, cache_lock); + cell_for_reserve->queue_iterator = main_priority->add(key, offset, size, cache_lock); } for (auto & cell : to_evict) @@ -573,11 +570,11 @@ bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std:: } } -bool LRUFileCache::tryReserveForMainList( +bool FileCache::tryReserveForMainList( const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard & cache_lock) { auto removed_size = 0; - size_t queue_size = queue.getElementsNum(cache_lock); + size_t queue_size = main_priority->getElementsNum(cache_lock); assert(queue_size <= max_element_size); /// Since space reservation is incremental, cache cell already exists if it's state is EMPTY. @@ -592,15 +589,18 @@ bool LRUFileCache::tryReserveForMainList( auto is_overflow = [&] { /// max_size == 0 means unlimited cache size, max_element_size means unlimited number of cache elements. 
- return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size) + return (max_size != 0 && main_priority->getCacheSize(cache_lock) + size - removed_size > max_size) || (max_element_size != 0 && queue_size > max_element_size); }; std::vector to_evict; std::vector trash; - for (const auto & [entry_key, entry_offset, entry_size, _] : queue) + for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) { + auto entry_key = it->key(); + auto entry_offset = it->offset(); + if (!is_overflow()) break; @@ -612,7 +612,7 @@ bool LRUFileCache::tryReserveForMainList( key.toString(), offset); size_t cell_size = cell->size(); - assert(entry_size == cell_size); + assert(it->size() == cell_size); /// It is guaranteed that cell is not removed from cache as long as /// pointer to corresponding file segment is hold by any other thread. @@ -671,9 +671,9 @@ bool LRUFileCache::tryReserveForMainList( /// If queue iterator already exists, we need to update the size after each space reservation. auto queue_iterator = cell_for_reserve->queue_iterator; if (queue_iterator) - queue.incrementSize(*queue_iterator, size, cache_lock); + queue_iterator->incrementSize(size, cache_lock); else - cell_for_reserve->queue_iterator = queue.add(key, offset, size, cache_lock); + cell_for_reserve->queue_iterator = main_priority->add(key, offset, size, cache_lock); } for (auto & cell : to_evict) @@ -682,7 +682,7 @@ bool LRUFileCache::tryReserveForMainList( remove_file_segment(file_segment); } - if (queue.getTotalCacheSize(cache_lock) > (1ull << 63)) + if (main_priority->getCacheSize(cache_lock) > (1ull << 63)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache became inconsistent. There must be a bug"); if (query_context) @@ -751,10 +751,12 @@ void LRUFileCache::removeIfReleasable(bool remove_persistent_files) std::lock_guard cache_lock(mutex); - std::vector to_remove; - for (auto it = queue.begin(); it != queue.end();) + std::vector to_remove; + for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) { - const auto & [key, offset, size, _] = *it++; + auto key = it->key(); + auto offset = it->offset(); + auto * cell = getCell(key, offset, cache_lock); if (!cell) throw Exception( @@ -776,6 +778,13 @@ void LRUFileCache::removeIfReleasable(bool remove_persistent_files) } } + for (auto & file_segment : to_remove) + { + std::lock_guard segment_lock(file_segment->mutex); + file_segment->detach(cache_lock, segment_lock); + remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock); + } + /// Remove all access information. 
records.clear(); stash_queue.removeAll(cache_lock); @@ -785,7 +794,7 @@ void LRUFileCache::removeIfReleasable(bool remove_persistent_files) #endif } -void LRUFileCache::remove( +void FileCache::remove( Key key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) { @@ -799,7 +808,7 @@ void LRUFileCache::remove( if (cell->queue_iterator) { - queue.remove(*cell->queue_iterator, cache_lock); + cell->queue_iterator->remove(cache_lock); } auto & offsets = files[key]; @@ -831,12 +840,12 @@ void LRUFileCache::remove( } } -void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock) +void FileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock) { Key key; UInt64 offset = 0; size_t size = 0; - std::vector>> queue_entries; + std::vector>> queue_entries; /// cache_base_path / key_prefix / key / offset @@ -888,7 +897,7 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_l { auto * cell = addCell(key, offset, size, FileSegment::State::DOWNLOADED, is_persistent, cache_lock); if (cell) - queue_entries.emplace_back(*cell->queue_iterator, cell->file_segment); + queue_entries.emplace_back(cell->queue_iterator, cell->file_segment); } else { @@ -912,14 +921,14 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_l if (file_segment.expired()) continue; - queue.moveToEnd(it, cache_lock); + it->use(cache_lock); } #ifndef NDEBUG assertCacheCorrectness(cache_lock); #endif } -void LRUFileCache::reduceSizeToDownloaded( +void FileCache::reduceSizeToDownloaded( const Key & key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) { @@ -952,7 +961,7 @@ void LRUFileCache::reduceSizeToDownloaded( cell->file_segment = std::make_shared(offset, downloaded_size, key, this, FileSegment::State::DOWNLOADED); } -bool LRUFileCache::isLastFileSegmentHolder( +bool FileCache::isLastFileSegmentHolder( const Key & key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) { @@ -965,7 +974,7 @@ bool LRUFileCache::isLastFileSegmentHolder( return cell->file_segment.use_count() == 2; } -FileSegments LRUFileCache::getSnapshot() const +FileSegments FileCache::getSnapshot() const { std::lock_guard cache_lock(mutex); @@ -979,7 +988,7 @@ FileSegments LRUFileCache::getSnapshot() const return file_segments; } -std::vector LRUFileCache::tryGetCachePaths(const Key & key) +std::vector FileCache::tryGetCachePaths(const Key & key) { std::lock_guard cache_lock(mutex); @@ -996,42 +1005,42 @@ std::vector LRUFileCache::tryGetCachePaths(const Key & key) return cache_paths; } -size_t LRUFileCache::getUsedCacheSize() const +size_t FileCache::getUsedCacheSize() const { std::lock_guard cache_lock(mutex); return getUsedCacheSizeUnlocked(cache_lock); } -size_t LRUFileCache::getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const +size_t FileCache::getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const { - return queue.getTotalCacheSize(cache_lock); + return main_priority->getCacheSize(cache_lock); } -size_t LRUFileCache::getAvailableCacheSize() const +size_t FileCache::getAvailableCacheSize() const { std::lock_guard cache_lock(mutex); return getAvailableCacheSizeUnlocked(cache_lock); } -size_t LRUFileCache::getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const +size_t FileCache::getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const { return max_size - getUsedCacheSizeUnlocked(cache_lock); } -size_t LRUFileCache::getFileSegmentsNum() const +size_t FileCache::getFileSegmentsNum() const { 
std::lock_guard cache_lock(mutex); return getFileSegmentsNumUnlocked(cache_lock); } -size_t LRUFileCache::getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const +size_t FileCache::getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const { - return queue.getElementsNum(cache_lock); + return main_priority->getElementsNum(cache_lock); } -LRUFileCache::FileSegmentCell::FileSegmentCell( +FileCache::FileSegmentCell::FileSegmentCell( FileSegmentPtr file_segment_, - LRUFileCache * cache, + FileCache * cache, std::lock_guard & cache_lock) : file_segment(file_segment_) { @@ -1045,7 +1054,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell( { case FileSegment::State::DOWNLOADED: { - queue_iterator = cache->queue.add(file_segment->key(), file_segment->offset(), file_segment->range().size(), cache_lock); + queue_iterator = cache->main_priority->add(file_segment->key(), file_segment->offset(), file_segment->range().size(), cache_lock); break; } case FileSegment::State::SKIP_CACHE: @@ -1133,7 +1142,7 @@ String LRUFileCache::dumpStructure(const Key & key) return dumpStructureUnlocked(key, cache_lock); } -String LRUFileCache::dumpStructureUnlocked(const Key & key, std::lock_guard & cache_lock) +String FileCache::dumpStructureUnlocked(const Key & key, std::lock_guard & cache_lock) { WriteBufferFromOwnString result; const auto & cells_by_offset = files[key]; @@ -1141,11 +1150,11 @@ String LRUFileCache::dumpStructureUnlocked(const Key & key, std::lock_guardgetInfoForLog() << "\n"; - result << "\n\nQueue: " << queue.toString(cache_lock); + result << "\n\nPriority: " << main_priority->toString(cache_lock); return result.str(); } -void LRUFileCache::assertCacheCellsCorrectness( +void FileCache::assertCacheCellsCorrectness( const FileSegmentsByOffset & cells_by_offset, [[maybe_unused]] std::lock_guard & cache_lock) { for (const auto & [_, cell] : cells_by_offset) @@ -1156,30 +1165,32 @@ void LRUFileCache::assertCacheCellsCorrectness( if (file_segment->reserved_size != 0) { assert(cell.queue_iterator); - assert(queue.contains(file_segment->key(), file_segment->offset(), cache_lock)); + assert(priority.contains(file_segment->key(), file_segment->offset(), cache_lock)); } } } -void LRUFileCache::assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock) +void FileCache::assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock) { assertCacheCellsCorrectness(files[key], cache_lock); - assertQueueCorrectness(cache_lock); + assertPriorityCorrectness(cache_lock); } -void LRUFileCache::assertCacheCorrectness(std::lock_guard & cache_lock) +void FileCache::assertCacheCorrectness(std::lock_guard & cache_lock) { for (const auto & [key, cells_by_offset] : files) assertCacheCellsCorrectness(files[key], cache_lock); - assertQueueCorrectness(cache_lock); + assertPriorityCorrectness(cache_lock); } -void LRUFileCache::assertQueueCorrectness(std::lock_guard & cache_lock) +void FileCache::assertPriorityCorrectness(std::lock_guard & cache_lock) { [[maybe_unused]] size_t total_size = 0; - for (auto it = queue.begin(); it != queue.end();) + for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) { - auto & [key, offset, size, _] = *it++; + auto key = it->key(); + auto offset = it->offset(); + auto size = it->size(); auto * cell = getCell(key, offset, cache_lock); if (!cell) @@ -1188,14 +1199,12 @@ void LRUFileCache::assertQueueCorrectness(std::lock_guard & cache_lo ErrorCodes::LOGICAL_ERROR, "Cache is in inconsistent state: LRU queue contains entries with no cache cell 
(assertCorrectness())"); } - assert(cell->size() == size); total_size += size; } - - assert(total_size == queue.getTotalCacheSize(cache_lock)); - assert(queue.getTotalCacheSize(cache_lock) <= max_size); - assert(queue.getElementsNum(cache_lock) <= max_element_size); + assert(total_size == main_priority->getCacheSize(cache_lock)); + assert(main_priority->getCacheSize(cache_lock) <= max_size); + assert(main_priority->getElementsNum(cache_lock) <= max_element_size); } } diff --git a/src/Common/tests/gtest_lru_file_cache.cpp b/src/Common/tests/gtest_lru_file_cache.cpp index 2f268e217df..8e7554f0418 100644 --- a/src/Common/tests/gtest_lru_file_cache.cpp +++ b/src/Common/tests/gtest_lru_file_cache.cpp @@ -85,7 +85,7 @@ void complete(const DB::FileSegmentsHolder & holder) } -TEST(LRUFileCache, get) +TEST(FileCache, get) { if (fs::exists(cache_base_path)) fs::remove_all(cache_base_path); @@ -103,7 +103,7 @@ TEST(LRUFileCache, get) DB::FileCacheSettings settings; settings.max_size = 30; settings.max_elements = 5; - auto cache = DB::LRUFileCache(cache_base_path, settings); + auto cache = DB::FileCache(cache_base_path, settings); cache.initialize(); auto key = cache.hash("key1"); @@ -479,7 +479,7 @@ TEST(LRUFileCache, get) { /// Test LRUCache::restore(). - auto cache2 = DB::LRUFileCache(cache_base_path, settings); + auto cache2 = DB::FileCache(cache_base_path, settings); cache2.initialize(); auto holder1 = cache2.getOrSet(key, 2, 28, false); /// Get [2, 29] @@ -499,7 +499,7 @@ TEST(LRUFileCache, get) auto settings2 = settings; settings2.max_file_segment_size = 10; - auto cache2 = DB::LRUFileCache(caches_dir / "cache2", settings2); + auto cache2 = DB::FileCache(caches_dir / "cache2", settings2); cache2.initialize(); auto holder1 = cache2.getOrSet(key, 0, 25, false); /// Get [0, 24] From ffaf44c1c1838fda438c4da374d2427e6576b0ff Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 14 Jun 2022 20:32:30 +0800 Subject: [PATCH 123/164] fix style --- src/Common/FileCache.h | 390 ++++++++++++++++++++++++++++++++ src/Common/IFileCachePriority.h | 2 +- src/Common/LRUFileCache.cpp | 8 +- 3 files changed, 395 insertions(+), 5 deletions(-) create mode 100644 src/Common/FileCache.h diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h new file mode 100644 index 00000000000..13bca0e2dae --- /dev/null +++ b/src/Common/FileCache.h @@ -0,0 +1,390 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "FileCache_fwd.h" +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +class IFileCache; +using FileCachePtr = std::shared_ptr; + +/** + * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. + */ +class IFileCache : private boost::noncopyable +{ +friend class FileSegment; +friend struct FileSegmentsHolder; +friend class FileSegmentRangeWriter; + +public: + using Key = UInt128; + using Downloader = std::unique_ptr; + + IFileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_); + + virtual ~IFileCache() = default; + + /// Restore cache from local filesystem. + virtual void initialize() = 0; + + virtual void remove(const Key & key) = 0; + + virtual void remove() = 0; + + static bool isReadOnly(); + + /// Cache capacity in bytes. 
+ size_t capacity() const { return max_size; } + + static Key hash(const String & path); + + String getPathInLocalCache(const Key & key, size_t offset); + + String getPathInLocalCache(const Key & key); + + const String & getBasePath() const { return cache_base_path; } + + virtual std::vector tryGetCachePaths(const Key & key) = 0; + + /** + * Given an `offset` and `size` representing [offset, offset + size) bytes interval, + * return list of cached non-overlapping non-empty + * file segments `[segment1, ..., segmentN]` which intersect with given interval. + * + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * As long as pointers to returned file segments are hold + * it is guaranteed that these file segments are not removed from cache. + */ + virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) = 0; + + /** + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" + * from cache (not owned by cache), and as a result will never change it's state and will be destructed + * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change + * it's state (and become DOWNLOADED). + */ + virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0; + + virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) = 0; + + virtual FileSegments getSnapshot() const = 0; + + /// For debug. + virtual String dumpStructure(const Key & key) = 0; + + virtual size_t getUsedCacheSize() const = 0; + + virtual size_t getFileSegmentsNum() const = 0; + +protected: + String cache_base_path; + size_t max_size; + size_t max_element_size; + size_t max_file_segment_size; + + bool is_initialized = false; + + mutable std::mutex mutex; + + virtual bool tryReserve( + const Key & key, size_t offset, size_t size, + std::lock_guard & cache_lock) = 0; + + virtual void remove( + Key key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) = 0; + + virtual bool isLastFileSegmentHolder( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) = 0; + + /// If file segment was partially downloaded and then space reservation fails (because of no + /// space left), then update corresponding cache cell metadata (file segment size). + virtual void reduceSizeToDownloaded( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) = 0; + + void assertInitialized() const; + +protected: + using KeyAndOffset = std::pair; + + struct KeyAndOffsetHash + { + std::size_t operator()(const KeyAndOffset & key) const + { + return std::hash()(key.first) ^ std::hash()(key.second); + } + }; + + using FileCacheRecords = std::unordered_map; + + /// Used to track and control the cache access of each query. + /// Through it, we can realize the processing of different queries by the cache layer. 
+ struct QueryContext + { + FileCacheRecords records; + FileCachePriorityPtr priority; + + size_t cache_size = 0; + size_t max_cache_size; + + bool skip_download_if_exceeds_query_cache; + + QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) + : priority(std::make_shared()) + , max_cache_size(max_cache_size_) + , skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {} + + void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) + { + if (cache_size < size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record != records.end()) + { + record->second->remove(cache_lock); + records.erase({key, offset}); + } + } + cache_size -= size; + } + + void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) + { + if (cache_size + size > max_cache_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size"); + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record == records.end()) + { + auto queue_iter = priority->add(key, offset, 0, cache_lock); + record = records.insert({{key, offset}, queue_iter}).first; + } + record->second->incrementSize(size, cache_lock); + } + cache_size += size; + } + + void use(const Key & key, size_t offset, std::lock_guard & cache_lock) + { + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record != records.end()) + record->second->use(cache_lock); + } + } + + size_t getMaxCacheSize() { return max_cache_size; } + + size_t getCacheSize() { return cache_size; } + + FileCachePriorityPtr getPriority() { return priority; } + + bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; } + }; + + using QueryContextPtr = std::shared_ptr; + using QueryContextMap = std::unordered_map; + + QueryContextMap query_map; + + bool enable_filesystem_query_cache_limit; + + QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); + + QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); + + void removeQueryContext(const String & query_id); + + QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); + +public: + /// Save a query context information, and adopt different cache policies + /// for different queries through the context cache layer. 
+ struct QueryContextHolder : private boost::noncopyable + { + explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_); + + QueryContextHolder() = default; + + ~QueryContextHolder(); + + String query_id {}; + IFileCache * cache = nullptr; + QueryContextPtr context = nullptr; + }; + + QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); +}; + +class FileCache final : public IFileCache +{ +public: + FileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_); + + FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override; + + FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; + + FileSegments getSnapshot() const override; + + void initialize() override; + + void remove(const Key & key) override; + + void remove() override; + + std::vector tryGetCachePaths(const Key & key) override; + + size_t getUsedCacheSize() const override; + + size_t getFileSegmentsNum() const override; + +private: + struct FileSegmentCell : private boost::noncopyable + { + FileSegmentPtr file_segment; + + /// Iterator is put here on first reservation attempt, if successful. + IFileCachePriority::Iterator queue_iterator; + + /// Pointer to file segment is always held by the cache itself. + /// Apart from pointer in cache, it can be held by cache users, when they call + /// getOrSet(), but cache users always hold it via FileSegmentsHolder. + bool releasable() const { return file_segment.unique(); } + + size_t size() const { return file_segment->reserved_size; } + + FileSegmentCell(FileSegmentPtr file_segment_, FileCache * cache, std::lock_guard & cache_lock); + + FileSegmentCell(FileSegmentCell && other) noexcept + : file_segment(std::move(other.file_segment)) + , queue_iterator(other.queue_iterator) {} + }; + + using FileSegmentsByOffset = std::map; + using CachedFiles = std::unordered_map; + + CachedFiles files; + FileCachePriorityPtr main_priority; + + FileCacheRecords stash_records; + FileCachePriorityPtr stash_priority; + + size_t max_stash_element_size; + size_t enable_cache_hits_threshold; + + Poco::Logger * log; + + FileSegments getImpl( + const Key & key, const FileSegment::Range & range, + std::lock_guard & cache_lock); + + FileSegmentCell * getCell( + const Key & key, size_t offset, std::lock_guard & cache_lock); + + FileSegmentCell * addCell( + const Key & key, size_t offset, size_t size, + FileSegment::State state, std::lock_guard & cache_lock); + + void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); + + bool tryReserve( + const Key & key, size_t offset, size_t size, + std::lock_guard & cache_lock) override; + + bool tryReserveForMainList( + const Key & key, size_t offset, size_t size, + QueryContextPtr query_context, + std::lock_guard & cache_lock); + + void remove( + Key key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) override; + + bool isLastFileSegmentHolder( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) override; + + void reduceSizeToDownloaded( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) override; + + size_t getAvailableCacheSize() const; + + void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); + + FileSegments splitRangeIntoCells( + const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard & cache_lock); + + 
String dumpStructureUnlocked(const Key & key_, std::lock_guard & cache_lock); + + void fillHolesWithEmptyFileSegments( + FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, std::lock_guard & cache_lock); + + FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) override; + + size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; + + size_t getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const; + + size_t getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const; + + void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard & cache_lock); + +public: + String dumpStructure(const Key & key_) override; + + void assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock); + + void assertCacheCorrectness(std::lock_guard & cache_lock); + + void assertPriorityCorrectness(std::lock_guard & cache_lock); +}; + +} diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 35b82d61228..a5186bbeea8 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -41,7 +41,7 @@ public: virtual void next() = 0; - virtual bool vaild() const = 0; + virtual bool valid() const = 0; /// Mark a cache record as recently used, it will update the priority /// of the cache record according to different cache algorithms. diff --git a/src/Common/LRUFileCache.cpp b/src/Common/LRUFileCache.cpp index 817208c6c30..54b07a81afe 100644 --- a/src/Common/LRUFileCache.cpp +++ b/src/Common/LRUFileCache.cpp @@ -486,7 +486,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc }; /// Select the cache from the LRU queue held by query for expulsion. 
- for (auto iter = query_context->getPriority()->getNewIterator(cache_lock); iter->vaild(); iter->next()) + for (auto iter = query_context->getPriority()->getNewIterator(cache_lock); iter->valid(); iter->next()) { if (!is_overflow()) break; @@ -596,7 +596,7 @@ bool FileCache::tryReserveForMainList( std::vector to_evict; std::vector trash; - for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) + for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) { auto entry_key = it->key(); auto entry_offset = it->offset(); @@ -752,7 +752,7 @@ void LRUFileCache::removeIfReleasable(bool remove_persistent_files) std::lock_guard cache_lock(mutex); std::vector to_remove; - for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) + for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) { auto key = it->key(); auto offset = it->offset(); @@ -1186,7 +1186,7 @@ void FileCache::assertCacheCorrectness(std::lock_guard & cache_lock) void FileCache::assertPriorityCorrectness(std::lock_guard & cache_lock) { [[maybe_unused]] size_t total_size = 0; - for (auto it = main_priority->getNewIterator(cache_lock); it->vaild(); it->next()) + for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) { auto key = it->key(); auto offset = it->offset(); From 43cf7716574c5d4c99a16433d18143928a480d25 Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 14 Jun 2022 21:17:52 +0800 Subject: [PATCH 124/164] better --- src/Common/IFileCachePriority.h | 30 ++++++++++++++++++++++-------- src/Common/LRUFileCache.cpp | 2 +- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index a5186bbeea8..677ccd76934 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + class IFileCachePriority; using FileCachePriorityPtr = std::shared_ptr; @@ -39,16 +44,29 @@ public: public: virtual ~IIterator() = default; - virtual void next() = 0; + virtual void next() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support next() for IIterator."); } - virtual bool valid() const = 0; + virtual bool valid() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support valid() for IIterator."); } /// Mark a cache record as recently used, it will update the priority /// of the cache record according to different cache algorithms. - virtual void use(std::lock_guard & cache_lock) = 0; + virtual void use(std::lock_guard &) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support use() for IIterator."); + } /// Deletes an existing cached record. 
- virtual void remove(std::lock_guard & cache_lock) = 0; + virtual void remove(std::lock_guard &) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support remove() for IIterator."); + } + + virtual Iterator getSnapshot() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support getSnapshot() for IIterator."); } + + virtual void incrementSize(size_t, std::lock_guard &) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support incrementSize() for IIterator."); + } virtual Key & key() const = 0; @@ -57,10 +75,6 @@ public: virtual size_t size() const = 0; virtual size_t hits() const = 0; - - virtual Iterator getSnapshot() = 0; - - virtual void incrementSize(size_t size_increment, std::lock_guard & cache_lock) = 0; }; public: diff --git a/src/Common/LRUFileCache.cpp b/src/Common/LRUFileCache.cpp index 54b07a81afe..1a9924ba332 100644 --- a/src/Common/LRUFileCache.cpp +++ b/src/Common/LRUFileCache.cpp @@ -1165,7 +1165,7 @@ void FileCache::assertCacheCellsCorrectness( if (file_segment->reserved_size != 0) { assert(cell.queue_iterator); - assert(priority.contains(file_segment->key(), file_segment->offset(), cache_lock)); + assert(main_priority->contains(file_segment->key(), file_segment->offset(), cache_lock)); } } } From c5f90225103b521b8c2dafb68c50813657d6c65f Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Sun, 26 Jun 2022 03:05:54 +0800 Subject: [PATCH 125/164] fix --- .../{LRUFileCache.cpp => FileCache.cpp} | 99 ++----- src/Common/FileCache.h | 267 ++---------------- src/Common/FileCacheFactory.cpp | 4 +- src/Common/IFileCache.cpp | 8 +- src/Common/IFileCache.h | 63 +---- src/Common/IFileCachePriority.h | 14 +- src/Common/LRUFileCache.h | 202 +++++-------- src/Common/tests/gtest_lru_file_cache.cpp | 2 +- 8 files changed, 131 insertions(+), 528 deletions(-) rename src/Common/{LRUFileCache.cpp => FileCache.cpp} (92%) diff --git a/src/Common/LRUFileCache.cpp b/src/Common/FileCache.cpp similarity index 92% rename from src/Common/LRUFileCache.cpp rename to src/Common/FileCache.cpp index 1a9924ba332..56eb4c9e081 100644 --- a/src/Common/LRUFileCache.cpp +++ b/src/Common/FileCache.cpp @@ -1,4 +1,4 @@ -#include "LRUFileCache.h" +#include "FileCache.h" #include #include @@ -11,6 +11,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -22,13 +23,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_) +FileCache::FileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_) : IFileCache(cache_base_path_, cache_settings_) , main_priority(std::make_shared()) , stash_priority(std::make_shared()) , max_stash_element_size(cache_settings_.max_elements) , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold) - , log(&Poco::Logger::get("LRUFileCache")) + , log(&Poco::Logger::get("FileCache")) , allow_to_remove_persistent_segments_from_cache_by_default(cache_settings_.allow_to_remove_persistent_segments_from_cache_by_default) { } @@ -173,7 +174,7 @@ FileSegments FileCache::getImpl( return result; } -FileSegments LRUFileCache::splitRangeIntoCells( +FileSegments FileCache::splitRangeIntoCells( const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock) { assert(size > 0); @@ -296,7 +297,7 @@ void FileCache::fillHolesWithEmptyFileSegments( } } -FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t size, bool 
is_persistent) +FileSegmentsHolder FileCache::getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) { assertInitialized(); @@ -356,7 +357,7 @@ FileSegmentsHolder FileCache::get(const Key & key, size_t offset, size_t size) return FileSegmentsHolder(std::move(file_segments)); } -LRUFileCache::FileSegmentCell * LRUFileCache::addCell( +FileCache::FileSegmentCell * FileCache::addCell( const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock) @@ -395,11 +396,9 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( } else { - auto queue_iter = record->second; - queue_iter->hits++; - stash_queue.moveToEnd(queue_iter, cache_lock); - - result_state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE; + auto priority_iter = record->second; + priority_iter->use(cache_lock); + result_state = priority_iter->hits() >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE; } } @@ -426,7 +425,7 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( return &(it->second); } -FileSegmentsHolder LRUFileCache::setDownloading( +FileSegmentsHolder FileCache::setDownloading( const Key & key, size_t offset, size_t size, @@ -691,7 +690,7 @@ bool FileCache::tryReserveForMainList( return true; } -void LRUFileCache::removeIfExists(const Key & key) +void FileCache::removeIfExists(const Key & key) { assertInitialized(); @@ -742,7 +741,7 @@ void LRUFileCache::removeIfExists(const Key & key) } } -void LRUFileCache::removeIfReleasable(bool remove_persistent_files) +void FileCache::removeIfReleasable(bool remove_persistent_files) { /// Try remove all cached files by cache_base_path. /// Only releasable file segments are evicted. @@ -786,8 +785,8 @@ void LRUFileCache::removeIfReleasable(bool remove_persistent_files) } /// Remove all access information. - records.clear(); - stash_queue.removeAll(cache_lock); + stash_records.clear(); + stash_priority->removeAll(cache_lock); #ifndef NDEBUG assertCacheCorrectness(cache_lock); @@ -1070,73 +1069,7 @@ FileCache::FileSegmentCell::FileSegmentCell( } } -IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add( - const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard & /* cache_lock */) -{ -#ifndef NDEBUG - for (const auto & [entry_key, entry_offset, entry_size, entry_hits] : queue) - { - if (entry_key == key && entry_offset == offset) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Attempt to add duplicate queue entry to queue. 
(Key: {}, offset: {}, size: {})", - key.toString(), offset, size); - } -#endif - - cache_size += size; - return queue.insert(queue.end(), FileKeyAndOffset(key, offset, size)); -} - -void IFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard & /* cache_lock */) -{ - cache_size -= queue_it->size; - queue.erase(queue_it); -} - -void IFileCache::LRUQueue::removeAll(std::lock_guard & /* cache_lock */) -{ - queue.clear(); - cache_size = 0; -} - -void IFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard & /* cache_lock */) -{ - queue.splice(queue.end(), queue, queue_it); -} - -void IFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard & /* cache_lock */) -{ - cache_size += size_increment; - queue_it->size += size_increment; -} - -bool IFileCache::LRUQueue::contains( - const IFileCache::Key & key, size_t offset, std::lock_guard & /* cache_lock */) const -{ - /// This method is used for assertions in debug mode. - /// So we do not care about complexity here. - for (const auto & [entry_key, entry_offset, size, _] : queue) - { - if (key == entry_key && offset == entry_offset) - return true; - } - return false; -} - -String IFileCache::LRUQueue::toString(std::lock_guard & /* cache_lock */) const -{ - String result; - for (const auto & [key, offset, size, _] : queue) - { - if (!result.empty()) - result += ", "; - result += fmt::format("{}: [{}, {}]", key.toString(), offset, offset + size - 1); - } - return result; -} - -String LRUFileCache::dumpStructure(const Key & key) +String FileCache::dumpStructure(const Key & key) { std::lock_guard cache_lock(mutex); return dumpStructureUnlocked(key, cache_lock); diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index 13bca0e2dae..5aa32ac94ca 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -11,252 +11,18 @@ #include #include -#include "FileCache_fwd.h" -#include #include #include -#include -#include -#include +#include + namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -class IFileCache; -using FileCachePtr = std::shared_ptr; - /** * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. + * Implements LRU eviction policy. */ -class IFileCache : private boost::noncopyable -{ -friend class FileSegment; -friend struct FileSegmentsHolder; -friend class FileSegmentRangeWriter; - -public: - using Key = UInt128; - using Downloader = std::unique_ptr; - - IFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); - - virtual ~IFileCache() = default; - - /// Restore cache from local filesystem. - virtual void initialize() = 0; - - virtual void remove(const Key & key) = 0; - - virtual void remove() = 0; - - static bool isReadOnly(); - - /// Cache capacity in bytes. - size_t capacity() const { return max_size; } - - static Key hash(const String & path); - - String getPathInLocalCache(const Key & key, size_t offset); - - String getPathInLocalCache(const Key & key); - - const String & getBasePath() const { return cache_base_path; } - - virtual std::vector tryGetCachePaths(const Key & key) = 0; - - /** - * Given an `offset` and `size` representing [offset, offset + size) bytes interval, - * return list of cached non-overlapping non-empty - * file segments `[segment1, ..., segmentN]` which intersect with given interval. - * - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). 
Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * As long as pointers to returned file segments are hold - * it is guaranteed that these file segments are not removed from cache. - */ - virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) = 0; - - /** - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" - * from cache (not owned by cache), and as a result will never change it's state and will be destructed - * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change - * it's state (and become DOWNLOADED). - */ - virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0; - - virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) = 0; - - virtual FileSegments getSnapshot() const = 0; - - /// For debug. - virtual String dumpStructure(const Key & key) = 0; - - virtual size_t getUsedCacheSize() const = 0; - - virtual size_t getFileSegmentsNum() const = 0; - -protected: - String cache_base_path; - size_t max_size; - size_t max_element_size; - size_t max_file_segment_size; - - bool is_initialized = false; - - mutable std::mutex mutex; - - virtual bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) = 0; - - virtual void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - virtual bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - /// If file segment was partially downloaded and then space reservation fails (because of no - /// space left), then update corresponding cache cell metadata (file segment size). - virtual void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - void assertInitialized() const; - -protected: - using KeyAndOffset = std::pair; - - struct KeyAndOffsetHash - { - std::size_t operator()(const KeyAndOffset & key) const - { - return std::hash()(key.first) ^ std::hash()(key.second); - } - }; - - using FileCacheRecords = std::unordered_map; - - /// Used to track and control the cache access of each query. - /// Through it, we can realize the processing of different queries by the cache layer. 
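// Standalone sketch of the access-records map kept by the QueryContext below: the lookup
// key is a (cache key, offset) pair with a small hash combining both members, mirroring
// KeyAndOffsetHash above. Names are illustrative only, and a plain hit counter stands in
// for the priority iterator the real FileCacheRecords stores.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>

using KeyAndOffset = std::pair<uint64_t, size_t>;   // (file key, offset within the file)

struct PairHash
{
    std::size_t operator()(const KeyAndOffset & k) const
    {
        return std::hash<uint64_t>()(k.first) ^ std::hash<size_t>()(k.second);
    }
};

using Records = std::unordered_map<KeyAndOffset, size_t /* hits */, PairHash>;

int main()
{
    Records records;
    ++records[{42, 0}];                              // first access creates the record
    ++records[{42, 0}];                              // later accesses bump it
    return records.at({42, 0}) == 2 ? 0 : 1;
}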
- struct QueryContext - { - FileCacheRecords records; - FileCachePriorityPtr priority; - - size_t cache_size = 0; - size_t max_cache_size; - - bool skip_download_if_exceeds_query_cache; - - QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) - : priority(std::make_shared()) - , max_cache_size(max_cache_size_) - , skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {} - - void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) - { - if (cache_size < size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record != records.end()) - { - record->second->remove(cache_lock); - records.erase({key, offset}); - } - } - cache_size -= size; - } - - void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) - { - if (cache_size + size > max_cache_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size"); - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record == records.end()) - { - auto queue_iter = priority->add(key, offset, 0, cache_lock); - record = records.insert({{key, offset}, queue_iter}).first; - } - record->second->incrementSize(size, cache_lock); - } - cache_size += size; - } - - void use(const Key & key, size_t offset, std::lock_guard & cache_lock) - { - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record != records.end()) - record->second->use(cache_lock); - } - } - - size_t getMaxCacheSize() { return max_cache_size; } - - size_t getCacheSize() { return cache_size; } - - FileCachePriorityPtr getPriority() { return priority; } - - bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; } - }; - - using QueryContextPtr = std::shared_ptr; - using QueryContextMap = std::unordered_map; - - QueryContextMap query_map; - - bool enable_filesystem_query_cache_limit; - - QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); - - QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); - - void removeQueryContext(const String & query_id); - - QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); - -public: - /// Save a query context information, and adopt different cache policies - /// for different queries through the context cache layer. 
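// Standalone sketch of the holder pattern declared just below: a per-query context is
// registered in a map keyed by query id, the holder keeps one extra reference for the
// lifetime of the query, and its destructor drops the registry entry once the registry
// and the holder are the only remaining owners (use_count() == 2). Illustrative names only.
#include <cstddef>
#include <map>
#include <memory>
#include <string>

struct Context { size_t cache_size = 0; };

std::map<std::string, std::shared_ptr<Context>> registry;

struct Holder
{
    std::string query_id;
    std::shared_ptr<Context> context;

    ~Holder()
    {
        // Only the registry and this holder still own the context: the query is done.
        if (context && context.use_count() == 2)
            registry.erase(query_id);
    }
};

Holder makeHolder(const std::string & query_id)
{
    auto context = std::make_shared<Context>();
    registry.emplace(query_id, context);
    return {query_id, context};
}

int main()
{
    {
        Holder holder = makeHolder("query-1");
        // ... the running query charges its cache usage to holder.context ...
    }                                         // holder destroyed -> "query-1" unregistered
    return registry.count("query-1");         // 0
}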
- struct QueryContextHolder : private boost::noncopyable - { - explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_); - - QueryContextHolder() = default; - - ~QueryContextHolder(); - - String query_id {}; - IFileCache * cache = nullptr; - QueryContextPtr context = nullptr; - }; - - QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); -}; - class FileCache final : public IFileCache { public: @@ -264,7 +30,7 @@ public: const String & cache_base_path_, const FileCacheSettings & cache_settings_); - FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override; + FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) override; FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; @@ -272,9 +38,9 @@ public: void initialize() override; - void remove(const Key & key) override; + void removeIfExists(const Key & key) override; - void remove() override; + void removeIfReleasable(bool remove_persistent_files) override; std::vector tryGetCachePaths(const Key & key) override; @@ -293,7 +59,7 @@ private: /// Pointer to file segment is always hold by the cache itself. /// Apart from pointer in cache, it can be hold by cache users, when they call /// getorSet(), but cache users always hold it via FileSegmentsHolder. - bool releasable() const { return file_segment.unique(); } + bool releasable() const {return file_segment.unique(); } size_t size() const { return file_segment->reserved_size; } @@ -317,6 +83,7 @@ private: size_t enable_cache_hits_threshold; Poco::Logger * log; + bool allow_to_remove_persistent_segments_from_cache_by_default; FileSegments getImpl( const Key & key, const FileSegment::Range & range, @@ -327,7 +94,8 @@ private: FileSegmentCell * addCell( const Key & key, size_t offset, size_t size, - FileSegment::State state, std::lock_guard & cache_lock); + FileSegment::State state, bool is_persistent, + std::lock_guard & cache_lock); void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); @@ -350,24 +118,19 @@ private: std::lock_guard & cache_lock, std::lock_guard & segment_lock) override; - void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - size_t getAvailableCacheSize() const; void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); FileSegments splitRangeIntoCells( - const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard & cache_lock); + const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock); String dumpStructureUnlocked(const Key & key_, std::lock_guard & cache_lock); void fillHolesWithEmptyFileSegments( - FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, std::lock_guard & cache_lock); + FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, bool is_persistent, std::lock_guard & cache_lock); - FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) override; + FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) override; size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; @@ -377,6 +140,10 @@ private: void assertCacheCellsCorrectness(const FileSegmentsByOffset & 
cells_by_offset, std::lock_guard & cache_lock); + void reduceSizeToDownloaded( + const Key & key, size_t offset, + std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) override; + public: String dumpStructure(const Key & key_) override; diff --git a/src/Common/FileCacheFactory.cpp b/src/Common/FileCacheFactory.cpp index 259c1d3f48e..b276760c0dd 100644 --- a/src/Common/FileCacheFactory.cpp +++ b/src/Common/FileCacheFactory.cpp @@ -1,5 +1,5 @@ #include "FileCacheFactory.h" -#include "LRUFileCache.h" +#include "FileCache.h" namespace DB { @@ -53,7 +53,7 @@ FileCachePtr FileCacheFactory::getOrCreate( return it->second->cache; } - auto cache = std::make_shared(cache_base_path, file_cache_settings); + auto cache = std::make_shared(cache_base_path, file_cache_settings); FileCacheData result{cache, file_cache_settings}; auto cache_it = caches.insert(caches.end(), std::move(result)); diff --git a/src/Common/IFileCache.cpp b/src/Common/IFileCache.cpp index 8fe434dd740..e3ed82d7b62 100644 --- a/src/Common/IFileCache.cpp +++ b/src/Common/IFileCache.cpp @@ -140,7 +140,7 @@ void IFileCache::QueryContext::remove(const Key & key, size_t offset, size_t siz auto record = records.find({key, offset}); if (record != records.end()) { - lru_queue.remove(record->second, cache_lock); + record->second->remove(cache_lock); records.erase({key, offset}); } } @@ -162,10 +162,10 @@ void IFileCache::QueryContext::reserve(const Key & key, size_t offset, size_t si auto record = records.find({key, offset}); if (record == records.end()) { - auto queue_iter = lru_queue.add(key, offset, 0, cache_lock); + auto queue_iter = priority->add(key, offset, 0, cache_lock); record = records.insert({{key, offset}, queue_iter}).first; } - record->second->size += size; + record->second->incrementSize(size, cache_lock); } cache_size += size; } @@ -177,7 +177,7 @@ void IFileCache::QueryContext::use(const Key & key, size_t offset, std::lock_gua auto record = records.find({key, offset}); if (record != records.end()) - lru_queue.moveToEnd(record->second, cache_lock); + record->second->use(cache_lock); } IFileCache::QueryContextHolder::QueryContextHolder( diff --git a/src/Common/IFileCache.h b/src/Common/IFileCache.h index c820d18cb95..f46a83d52cf 100644 --- a/src/Common/IFileCache.h +++ b/src/Common/IFileCache.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -28,16 +29,7 @@ friend struct FileSegmentsHolder; friend class FileSegmentRangeWriter; public: - struct Key - { - UInt128 key; - String toString() const; - - Key() = default; - explicit Key(const UInt128 & key_) : key(key_) {} - - bool operator==(const Key & other) const { return key == other.key; } - }; + using Key = IFileCachePriority::Key; IFileCache( const String & cache_base_path_, @@ -133,49 +125,6 @@ protected: void assertInitialized() const; - class LRUQueue - { - public: - struct FileKeyAndOffset - { - Key key; - size_t offset; - size_t size; - size_t hits = 0; - - FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {} - }; - - using Iterator = typename std::list::iterator; - - size_t getTotalCacheSize(std::lock_guard & /* cache_lock */) const { return cache_size; } - - size_t getElementsNum(std::lock_guard & /* cache_lock */) const { return queue.size(); } - - Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); - - void remove(Iterator queue_it, std::lock_guard & cache_lock); - - void moveToEnd(Iterator queue_it, std::lock_guard & cache_lock); - - /// Space 
reservation for a file segment is incremental, so we need to be able to increment size of the queue entry. - void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard & cache_lock); - - String toString(std::lock_guard & cache_lock) const; - - bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) const; - - Iterator begin() { return queue.begin(); } - - Iterator end() { return queue.end(); } - - void removeAll(std::lock_guard & cache_lock); - - private: - std::list queue; - size_t cache_size = 0; - }; - using AccessKeyAndOffset = std::pair; struct KeyAndOffsetHash { @@ -185,14 +134,14 @@ protected: } }; - using AccessRecord = std::unordered_map; + using FileCacheRecords = std::unordered_map; /// Used to track and control the cache access of each query. /// Through it, we can realize the processing of different queries by the cache layer. struct QueryContext { - LRUQueue lru_queue; - AccessRecord records; + FileCacheRecords records; + FileCachePriorityPtr priority; size_t cache_size = 0; size_t max_cache_size; @@ -213,7 +162,7 @@ protected: size_t getCacheSize() const { return cache_size; } - LRUQueue & queue() { return lru_queue; } + FileCachePriorityPtr getPriority() { return priority; } bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; } }; diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 677ccd76934..a29d66c70be 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -19,7 +22,16 @@ using FileCachePriorityPtr = std::shared_ptr; class IFileCachePriority { public: - using Key = UInt128; + struct Key + { + UInt128 key; + String toString() const; + + Key() = default; + explicit Key(const UInt128 & key_) : key(key_) {} + + bool operator==(const Key & other) const { return key == other.key; } + }; class IIterator; friend class IIterator; diff --git a/src/Common/LRUFileCache.h b/src/Common/LRUFileCache.h index 059fc0c22c9..0bd87b2b38c 100644 --- a/src/Common/LRUFileCache.h +++ b/src/Common/LRUFileCache.h @@ -1,157 +1,99 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - +#include namespace DB { -/** - * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. - * Implements LRU eviction policy. 
- */ -class LRUFileCache final : public IFileCache +class LRUFileCache : public IFileCachePriority { public: - LRUFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); + using LRUQueue = std::list; + using LRUQueueIterator = typename LRUQueue::iterator; + class WriteableIterator; - FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) override; - - FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; - - FileSegments getSnapshot() const override; - - void initialize() override; - - void removeIfExists(const Key & key) override; - - void removeIfReleasable(bool remove_persistent_files) override; - - std::vector tryGetCachePaths(const Key & key) override; - - size_t getUsedCacheSize() const override; - - size_t getFileSegmentsNum() const override; - -private: - struct FileSegmentCell : private boost::noncopyable + class ReadableIterator : public IIterator { - FileSegmentPtr file_segment; + public: + ReadableIterator(LRUFileCache * file_cache_, LRUQueueIterator queue_iter_) : file_cache(file_cache_), queue_iter(queue_iter_) { } - /// Iterator is put here on first reservation attempt, if successful. - std::optional queue_iterator; + void next() override { queue_iter++; } - /// Pointer to file segment is always hold by the cache itself. - /// Apart from pointer in cache, it can be hold by cache users, when they call - /// getorSet(), but cache users always hold it via FileSegmentsHolder. - bool releasable() const {return file_segment.unique(); } + bool valid() const override { return queue_iter != file_cache->queue.end(); } - size_t size() const { return file_segment->reserved_size; } + Key & key() const override { return queue_iter->key; } - FileSegmentCell(FileSegmentPtr file_segment_, LRUFileCache * cache, std::lock_guard & cache_lock); + size_t offset() const override { return queue_iter->offset; } - FileSegmentCell(FileSegmentCell && other) noexcept - : file_segment(std::move(other.file_segment)) - , queue_iterator(other.queue_iterator) {} + size_t size() const override { return queue_iter->size; } + + size_t hits() const override { return queue_iter->hits; } + + Iterator getSnapshot() override { return std::make_shared(file_cache, queue_iter); } + + protected: + LRUFileCache * file_cache; + LRUQueueIterator queue_iter; }; - using FileSegmentsByOffset = std::map; - using CachedFiles = std::unordered_map; + class WriteableIterator : public ReadableIterator + { + public: + WriteableIterator(LRUFileCache * file_cache_, LRUQueueIterator queue_iter_) : ReadableIterator(file_cache_, queue_iter_) { } - CachedFiles files; - LRUQueue queue; + void remove(std::lock_guard &) override + { + file_cache->cache_size -= queue_iter->size; + file_cache->queue.erase(queue_iter); + } - LRUQueue stash_queue; - AccessRecord records; + void incrementSize(size_t size_increment, std::lock_guard &) override + { + file_cache->cache_size += size_increment; + queue_iter->size += size_increment; + } - size_t max_stash_element_size; - size_t enable_cache_hits_threshold; - - Poco::Logger * log; - bool allow_to_remove_persistent_segments_from_cache_by_default; - - FileSegments getImpl( - const Key & key, const FileSegment::Range & range, - std::lock_guard & cache_lock); - - FileSegmentCell * getCell( - const Key & key, size_t offset, std::lock_guard & cache_lock); - - FileSegmentCell * addCell( - const Key & key, size_t offset, size_t size, - FileSegment::State state, bool is_persistent, - std::lock_guard & cache_lock); - 
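// Standalone sketch of the splice-based LRU policy implemented by the iterators above:
// entries live in a std::list with the lowest-priority entry at the front, and "using"
// an entry bumps its hit count and moves it to the back in O(1) without invalidating
// iterators to other entries. Illustrative names, not the actual cache code.
#include <cassert>
#include <cstddef>
#include <list>

struct Entry { int key; size_t size; size_t hits = 0; };

int main()
{
    std::list<Entry> queue;
    auto a = queue.insert(queue.end(), {1, 10});     // added first
    queue.insert(queue.end(), {2, 20});              // added second

    // use(a): bump the hit count and move the entry to the highest-priority position.
    a->hits++;
    queue.splice(queue.end(), queue, a);

    // Eviction would now start from the front, i.e. from entry 2.
    assert(queue.front().key == 2);
    assert(queue.back().key == 1 && queue.back().hits == 1);
    return 0;
}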
- void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); - - bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) override; - - bool tryReserveForMainList( - const Key & key, size_t offset, size_t size, - QueryContextPtr query_context, - std::lock_guard & cache_lock); - - void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - size_t getAvailableCacheSize() const; - - void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); - - FileSegments splitRangeIntoCells( - const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock); - - String dumpStructureUnlocked(const Key & key_, std::lock_guard & cache_lock); - - void fillHolesWithEmptyFileSegments( - FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, bool is_persistent, std::lock_guard & cache_lock); - - FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) override; - - size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; - - size_t getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const; - - size_t getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const; - - void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard & cache_lock); - - void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) override; + void use(std::lock_guard &) override + { + queue_iter->hits++; + file_cache->queue.splice(file_cache->queue.end(), file_cache->queue, queue_iter); + } + }; public: - String dumpStructure(const Key & key_) override; + LRUFileCache() = default; - void assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock); + Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override + { + auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size)); + cache_size += size; + return std::make_shared(this, iter); + } - void assertCacheCorrectness(std::lock_guard & cache_lock); + bool contains(const Key & key, size_t offset, std::lock_guard &) override + { + for (const auto & record : queue) + { + if (key == record.key && offset == record.offset) + return true; + } + return false; + } - void assertQueueCorrectness(std::lock_guard & cache_lock); + void removeAll(std::lock_guard &) override + { + queue.clear(); + cache_size = 0; + } + + Iterator getNewIterator(std::lock_guard &) override { return std::make_shared(this, queue.begin()); } + + size_t getElementsNum(std::lock_guard &) const override { return queue.size(); } + + std::string toString(std::lock_guard &) const override { return {}; } + +private: + LRUQueue queue; }; -} +}; diff --git a/src/Common/tests/gtest_lru_file_cache.cpp b/src/Common/tests/gtest_lru_file_cache.cpp index 8e7554f0418..ac942d97a32 100644 --- a/src/Common/tests/gtest_lru_file_cache.cpp +++ b/src/Common/tests/gtest_lru_file_cache.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include From 50fd740ec34825962e1f4e17c7bba2e84742692f Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Sun, 26 Jun 2022 20:35:02 +0800 Subject: [PATCH 
126/164] fix --- src/Common/FileCache.cpp | 2 +- src/Common/IFileCachePriority.h | 6 +++--- src/Common/LRUFileCache.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 56eb4c9e081..bf981cc466e 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -772,7 +772,7 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) { std::lock_guard segment_lock(file_segment->mutex); file_segment->detach(cache_lock, segment_lock); - remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock); + to_remove.emplace_back(file_segment); } } } diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index a29d66c70be..2e73bb92841 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -3,9 +3,9 @@ #include #include #include +#include #include #include -#include namespace DB { @@ -28,7 +28,7 @@ public: String toString() const; Key() = default; - explicit Key(const UInt128 & key_) : key(key_) {} + explicit Key(const UInt128 & key_) : key(key_) { } bool operator==(const Key & other) const { return key == other.key; } }; @@ -80,7 +80,7 @@ public: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support incrementSize() for IIterator."); } - virtual Key & key() const = 0; + virtual Key key() const = 0; virtual size_t offset() const = 0; diff --git a/src/Common/LRUFileCache.h b/src/Common/LRUFileCache.h index 0bd87b2b38c..6e42a7732d4 100644 --- a/src/Common/LRUFileCache.h +++ b/src/Common/LRUFileCache.h @@ -19,9 +19,9 @@ public: void next() override { queue_iter++; } - bool valid() const override { return queue_iter != file_cache->queue.end(); } + bool valid() const override { return (file_cache->queue.size() && (queue_iter != file_cache->queue.end())); } - Key & key() const override { return queue_iter->key; } + Key key() const override { return queue_iter->key; } size_t offset() const override { return queue_iter->offset; } From 9d83b93e88025f4052e5a32562377b26ab87a0cd Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Wed, 10 Aug 2022 13:50:30 +0800 Subject: [PATCH 127/164] fix rebase --- src/Common/FileCache.cpp | 196 +++++++++++++- src/Common/FileCache.h | 247 ++++++++++++++---- src/Common/FileCacheType.h | 28 ++ src/Common/FileCache_fwd.h | 4 +- src/Common/FileSegment.cpp | 6 +- src/Common/FileSegment.h | 10 +- src/Common/IFileCache.cpp | 201 -------------- src/Common/IFileCache.h | 216 --------------- src/Common/IFileCachePriority.h | 68 ++--- ...{LRUFileCache.h => LRUFileCachePriority.h} | 41 +-- src/Common/tests/gtest_lru_file_cache.cpp | 2 +- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 2 +- src/Disks/IO/CachedReadBufferFromRemoteFS.h | 6 +- .../ObjectStorages/DiskObjectStorage.cpp | 3 +- .../DiskObjectStorageCommon.cpp | 2 +- src/Disks/ObjectStorages/IObjectStorage.h | 1 + .../ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/IO/WriteBufferFromS3.cpp | 2 +- src/Interpreters/AsynchronousMetrics.cpp | 2 +- .../InterpreterDescribeCacheQuery.cpp | 2 +- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- .../System/StorageSystemFilesystemCache.cpp | 2 +- .../System/StorageSystemRemoteDataPaths.cpp | 2 +- 23 files changed, 480 insertions(+), 567 deletions(-) create mode 100644 src/Common/FileCacheType.h delete mode 100644 src/Common/IFileCache.cpp delete mode 100644 src/Common/IFileCache.h rename src/Common/{LRUFileCache.h => LRUFileCachePriority.h} (61%) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 
bf981cc466e..2a2fc68e768 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include namespace fs = std::filesystem; @@ -23,10 +23,16 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -FileCache::FileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_) - : IFileCache(cache_base_path_, cache_settings_) - , main_priority(std::make_shared()) - , stash_priority(std::make_shared()) +FileCache::FileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_) + : cache_base_path(cache_base_path_) + , max_size(cache_settings_.max_size) + , max_element_size(cache_settings_.max_elements) + , max_file_segment_size(cache_settings_.max_file_segment_size) + , enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit) + , main_priority(std::make_shared()) + , stash_priority(std::make_shared()) , max_stash_element_size(cache_settings_.max_elements) , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold) , log(&Poco::Logger::get("FileCache")) @@ -34,6 +40,175 @@ FileCache::FileCache(const String & cache_base_path_, const FileCacheSettings & { } +String FileCache::Key::toString() const +{ + return getHexUIntLowercase(key); +} + +FileCache::Key FileCache::hash(const String & path) +{ + return Key(sipHash128(path.data(), path.size())); +} + +String FileCache::getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const +{ + auto key_str = key.toString(); + return fs::path(cache_base_path) + / key_str.substr(0, 3) + / key_str + / (std::to_string(offset) + (is_persistent ? "_persistent" : "")); +} + +String FileCache::getPathInLocalCache(const Key & key) const +{ + auto key_str = key.toString(); + return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str; +} + +static bool isQueryInitialized() +{ + return CurrentThread::isInitialized() + && CurrentThread::get().getQueryContext() + && CurrentThread::getQueryId().size != 0; +} + +bool FileCache::isReadOnly() +{ + return !isQueryInitialized(); +} + +void FileCache::assertInitialized() const +{ + if (!is_initialized) + throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized"); +} + +FileCache::QueryContextPtr FileCache::getCurrentQueryContext(std::lock_guard & cache_lock) +{ + if (!isQueryInitialized()) + return nullptr; + + return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock); +} + +FileCache::QueryContextPtr FileCache::getQueryContext(const String & query_id, std::lock_guard & /* cache_lock */) +{ + auto query_iter = query_map.find(query_id); + return (query_iter == query_map.end()) ? 
nullptr : query_iter->second; +} + +void FileCache::removeQueryContext(const String & query_id) +{ + std::lock_guard cache_lock(mutex); + auto query_iter = query_map.find(query_id); + + if (query_iter == query_map.end()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to release query context that does not exist (query_id: {})", + query_id); + } + + query_map.erase(query_iter); +} + +FileCache::QueryContextPtr FileCache::getOrSetQueryContext( + const String & query_id, const ReadSettings & settings, std::lock_guard & cache_lock) +{ + if (query_id.empty()) + return nullptr; + + auto context = getQueryContext(query_id, cache_lock); + if (context) + return context; + + auto query_context = std::make_shared(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache); + auto query_iter = query_map.emplace(query_id, query_context).first; + return query_iter->second; +} + +FileCache::QueryContextHolder FileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings) +{ + std::lock_guard cache_lock(mutex); + + if (!enable_filesystem_query_cache_limit || settings.max_query_cache_size == 0) + return {}; + + /// if enable_filesystem_query_cache_limit is true, and max_query_cache_size large than zero, + /// we create context query for current query. + auto context = getOrSetQueryContext(query_id, settings, cache_lock); + return QueryContextHolder(query_id, this, context); +} + +void FileCache::QueryContext::remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) +{ + if (cache_size < size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record != records.end()) + { + record->second->remove(cache_lock); + records.erase({key, offset}); + } + } + cache_size -= size; +} + +void FileCache::QueryContext::reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) +{ + if (cache_size + size > max_cache_size) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Reserved cache size exceeds the remaining cache size (key: {}, offset: {})", + key.toString(), offset); + } + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record == records.end()) + { + auto queue_iter = priority->add(key, offset, 0, cache_lock); + record = records.insert({{key, offset}, queue_iter}).first; + } + record->second->incrementSize(size, cache_lock); + } + cache_size += size; +} + +void FileCache::QueryContext::use(const Key & key, size_t offset, std::lock_guard & cache_lock) +{ + if (skip_download_if_exceeds_query_cache) + return; + + auto record = records.find({key, offset}); + if (record != records.end()) + record->second->use(cache_lock); +} + +FileCache::QueryContextHolder::QueryContextHolder( + const String & query_id_, + FileCache * cache_, + FileCache::QueryContextPtr context_) + : query_id(query_id_) + , cache(cache_) + , context(context_) +{ +} + +FileCache::QueryContextHolder::~QueryContextHolder() +{ + /// If only the query_map and the current holder hold the context_query, + /// the query has been completed and the query_context is released. 
+ if (context && context.use_count() == 2) + cache->removeQueryContext(query_id); +} + void FileCache::initialize() { std::lock_guard cache_lock(mutex); @@ -115,7 +290,7 @@ FileSegments FileCache::getImpl( files.erase(key); /// Note: it is guaranteed that there is no concurrency with files deletion, - /// because cache files are deleted only inside IFileCache and under cache lock. + /// because cache files are deleted only inside FileCache and under cache lock. if (fs::exists(key_path)) fs::remove_all(key_path); @@ -387,7 +562,7 @@ FileCache::FileSegmentCell * FileCache::addCell( if (stash_priority->getElementsNum(cache_lock) > max_stash_element_size) { - auto remove_priority_iter = stash_priority->getNewIterator(cache_lock); + auto remove_priority_iter = stash_priority->getNewIterator(cache_lock)->getWriteIterator(); stash_records.erase({remove_priority_iter->key(), remove_priority_iter->offset()}); remove_priority_iter->remove(cache_lock); } @@ -473,7 +648,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc auto * cell_for_reserve = getCell(key, offset, cache_lock); - std::vector ghost; + std::vector ghost; std::vector trash; std::vector to_evict; @@ -496,7 +671,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc { /// The cache corresponding to this record may be swapped out by /// other queries, so it has become invalid. - ghost.push_back(iter->getSnapshot()); + ghost.push_back(iter->getWriteIterator()); removed_size += iter->size(); } else @@ -844,10 +1019,9 @@ void FileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock Key key; UInt64 offset = 0; size_t size = 0; - std::vector>> queue_entries; + std::vector>> queue_entries; /// cache_base_path / key_prefix / key / offset - if (!files.empty()) throw Exception( ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index 5aa32ac94ca..bccd2dbd458 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -3,50 +3,196 @@ #include #include #include +#include #include #include #include #include #include #include -#include -#include +#include +#include +#include #include -#include - +#include +#include +#include namespace DB { /** * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. - * Implements LRU eviction policy. - */ -class FileCache final : public IFileCache + */ +class FileCache : private boost::noncopyable { + friend class FileSegment; + friend class IFileCachePriority; + friend struct FileSegmentsHolder; + friend class FileSegmentRangeWriter; + public: - FileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); + using Key = DB::FileCacheKey; - FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) override; + FileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_); - FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; + ~FileCache() = default; - FileSegments getSnapshot() const override; + /// Restore cache from local filesystem. + void initialize(); - void initialize() override; + void removeIfExists(const Key & key); - void removeIfExists(const Key & key) override; + void removeIfReleasable(bool remove_persistent_files); - void removeIfReleasable(bool remove_persistent_files) override; + static bool isReadOnly(); - std::vector tryGetCachePaths(const Key & key) override; + /// Cache capacity in bytes. 
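// Standalone sketch of the admission "stash" used by addCell() above: an offset is only
// admitted into the cache (state EMPTY) after it has been requested at least
// enable_cache_hits_threshold times; until then it is tracked in a bounded stash and the
// segment is created as SKIP_CACHE. Container choices and names here are illustrative.
#include <cstddef>
#include <list>
#include <unordered_map>

class AdmissionStash
{
public:
    AdmissionStash(size_t hits_threshold_, size_t max_elements_)
        : hits_threshold(hits_threshold_), max_elements(max_elements_) {}

    /// Returns true once the offset has been seen often enough to be worth caching.
    bool shouldCache(size_t offset)
    {
        auto it = records.find(offset);
        if (it == records.end())
        {
            if (!queue.empty() && queue.size() >= max_elements)
            {
                records.erase(queue.front());    // stash is full: forget the oldest candidate
                queue.pop_front();
            }
            queue.push_back(offset);
            it = records.emplace(offset, 0).first;
        }
        return ++it->second >= hits_threshold;
    }

private:
    std::list<size_t> queue;                     // oldest candidate at the front
    std::unordered_map<size_t, size_t> records;  // offset -> hits
    size_t hits_threshold;
    size_t max_elements;
};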
+ size_t capacity() const { return max_size; } - size_t getUsedCacheSize() const override; + static Key hash(const String & path); - size_t getFileSegmentsNum() const override; + String getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const; + + String getPathInLocalCache(const Key & key) const; + + const String & getBasePath() const { return cache_base_path; } + + std::vector tryGetCachePaths(const Key & key); + + /** + * Given an `offset` and `size` representing [offset, offset + size) bytes interval, + * return list of cached non-overlapping non-empty + * file segments `[segment1, ..., segmentN]` which intersect with given interval. + * + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * As long as pointers to returned file segments are hold + * it is guaranteed that these file segments are not removed from cache. + */ + FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent); + + /** + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" + * from cache (not owned by cache), and as a result will never change it's state and will be destructed + * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change + * it's state (and become DOWNLOADED). + */ + FileSegmentsHolder get(const Key & key, size_t offset, size_t size); + + FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent); + + FileSegments getSnapshot() const; + + /// For debug. + String dumpStructure(const Key & key); + + size_t getUsedCacheSize() const; + + size_t getFileSegmentsNum() const; + +private: + String cache_base_path; + size_t max_size; + size_t max_element_size; + size_t max_file_segment_size; + + bool is_initialized = false; + + mutable std::mutex mutex; + + bool tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void remove(Key key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & segment_lock); + + bool isLastFileSegmentHolder( + const Key & key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & segment_lock); + + void reduceSizeToDownloaded( + const Key & key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */); + + void assertInitialized() const; + + using AccessKeyAndOffset = std::pair; + struct KeyAndOffsetHash + { + std::size_t operator()(const AccessKeyAndOffset & key) const + { + return std::hash()(key.first.key) ^ std::hash()(key.second); + } + }; + + using FileCacheRecords = std::unordered_map; + + /// Used to track and control the cache access of each query. + /// Through it, we can realize the processing of different queries by the cache layer. 
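// Standalone sketch of the per-query accounting performed by the QueryContext below
// (illustrative names): every byte a query reserves in the cache is charged to its own
// counter and checked against the per-query quota, independently of the global cache size.
#include <cstddef>
#include <stdexcept>

struct QueryAccounting
{
    size_t cache_size = 0;       // bytes currently attributed to this query
    size_t max_cache_size = 0;   // per-query quota (the max_query_cache_size setting)

    void reserve(size_t size)
    {
        if (cache_size + size > max_cache_size)
            throw std::logic_error("Reserved cache size exceeds the remaining cache size");
        cache_size += size;
    }

    void remove(size_t size)
    {
        if (cache_size < size)
            throw std::logic_error("Deleted cache size exceeds existing cache size");
        cache_size -= size;
    }
};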
+ struct QueryContext + { + FileCacheRecords records; + FileCachePriorityPtr priority; + + size_t cache_size = 0; + size_t max_cache_size; + + bool skip_download_if_exceeds_query_cache; + + QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) + : max_cache_size(max_cache_size_), skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) + { + } + + void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void use(const Key & key, size_t offset, std::lock_guard & cache_lock); + + size_t getMaxCacheSize() const { return max_cache_size; } + + size_t getCacheSize() const { return cache_size; } + + FileCachePriorityPtr getPriority() { return priority; } + + bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; } + }; + + using QueryContextPtr = std::shared_ptr; + using QueryContextMap = std::unordered_map; + + QueryContextMap query_map; + + bool enable_filesystem_query_cache_limit; + + QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); + + QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); + + void removeQueryContext(const String & query_id); + + QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); + +public: + /// Save a query context information, and adopt different cache policies + /// for different queries through the context cache layer. + struct QueryContextHolder : private boost::noncopyable + { + QueryContextHolder(const String & query_id_, FileCache * cache_, QueryContextPtr context_); + + QueryContextHolder() = default; + + ~QueryContextHolder(); + + String query_id; + FileCache * cache = nullptr; + QueryContextPtr context; + }; + + QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); private: struct FileSegmentCell : private boost::noncopyable @@ -54,20 +200,21 @@ private: FileSegmentPtr file_segment; /// Iterator is put here on first reservation attempt, if successful. - IFileCachePriority::Iterator queue_iterator; + IFileCachePriority::WriteIterator queue_iterator; /// Pointer to file segment is always hold by the cache itself. /// Apart from pointer in cache, it can be hold by cache users, when they call /// getorSet(), but cache users always hold it via FileSegmentsHolder. 
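// Standalone sketch of the "releasable" check below: a cell may be evicted only when the
// cache's own shared_ptr is the last reference to the file segment (the cache itself tests
// this with file_segment.unique()), i.e. no FileSegmentsHolder handed out by getOrSet()
// still points at it. Illustrative names only.
#include <cassert>
#include <cstddef>
#include <memory>

struct Segment { size_t reserved_size = 0; };

int main()
{
    auto in_cache = std::make_shared<Segment>();      // the reference held by the cache cell
    assert(in_cache.use_count() == 1);                // no user holds it -> releasable

    auto held_by_user = in_cache;                     // handed out via a FileSegmentsHolder
    assert(held_by_user.use_count() == 2);            // still in use -> must not be evicted
    return 0;
}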
- bool releasable() const {return file_segment.unique(); } + bool releasable() const { return file_segment.unique(); } size_t size() const { return file_segment->reserved_size; } FileSegmentCell(FileSegmentPtr file_segment_, FileCache * cache, std::lock_guard & cache_lock); FileSegmentCell(FileSegmentCell && other) noexcept - : file_segment(std::move(other.file_segment)) - , queue_iterator(other.queue_iterator) {} + : file_segment(std::move(other.file_segment)), queue_iterator(other.queue_iterator) + { + } }; using FileSegmentsByOffset = std::map; @@ -85,52 +232,44 @@ private: Poco::Logger * log; bool allow_to_remove_persistent_segments_from_cache_by_default; - FileSegments getImpl( - const Key & key, const FileSegment::Range & range, - std::lock_guard & cache_lock); + FileSegments getImpl(const Key & key, const FileSegment::Range & range, std::lock_guard & cache_lock); - FileSegmentCell * getCell( - const Key & key, size_t offset, std::lock_guard & cache_lock); + FileSegmentCell * getCell(const Key & key, size_t offset, std::lock_guard & cache_lock); FileSegmentCell * addCell( - const Key & key, size_t offset, size_t size, - FileSegment::State state, bool is_persistent, + const Key & key, + size_t offset, + size_t size, + FileSegment::State state, + bool is_persistent, std::lock_guard & cache_lock); void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); - bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) override; - bool tryReserveForMainList( - const Key & key, size_t offset, size_t size, - QueryContextPtr query_context, - std::lock_guard & cache_lock); - - void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; + const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard & cache_lock); size_t getAvailableCacheSize() const; void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); FileSegments splitRangeIntoCells( - const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock); + const Key & key, + size_t offset, + size_t size, + FileSegment::State state, + bool is_persistent, + std::lock_guard & cache_lock); String dumpStructureUnlocked(const Key & key_, std::lock_guard & cache_lock); void fillHolesWithEmptyFileSegments( - FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, bool is_persistent, std::lock_guard & cache_lock); - - FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) override; + FileSegments & file_segments, + const Key & key, + const FileSegment::Range & range, + bool fill_with_detached_file_segments, + bool is_persistent, + std::lock_guard & cache_lock); size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; @@ -140,13 +279,7 @@ private: void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard & cache_lock); - void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) override; - public: - String dumpStructure(const Key & key_) override; - void assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock); void assertCacheCorrectness(std::lock_guard & 
cache_lock); diff --git a/src/Common/FileCacheType.h b/src/Common/FileCacheType.h new file mode 100644 index 00000000000..9b3ec5a6af0 --- /dev/null +++ b/src/Common/FileCacheType.h @@ -0,0 +1,28 @@ +#pragma once +#include + +namespace DB +{ + +struct FileCacheKey +{ + UInt128 key; + String toString() const; + + FileCacheKey() = default; + explicit FileCacheKey(const UInt128 & key_) : key(key_) { } + + bool operator==(const FileCacheKey & other) const { return key == other.key; } +}; + +} + +namespace std +{ +template <> +struct hash +{ + std::size_t operator()(const DB::FileCacheKey & k) const { return hash()(k.key); } +}; + +} diff --git a/src/Common/FileCache_fwd.h b/src/Common/FileCache_fwd.h index 8a7c2eeb458..9f6b2a740fc 100644 --- a/src/Common/FileCache_fwd.h +++ b/src/Common/FileCache_fwd.h @@ -9,8 +9,8 @@ static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 100 static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024; static constexpr int REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD = 0; -class IFileCache; -using FileCachePtr = std::shared_ptr; +class FileCache; +using FileCachePtr = std::shared_ptr; struct FileCacheSettings; diff --git a/src/Common/FileSegment.cpp b/src/Common/FileSegment.cpp index c16d4658ae5..2aba93bbdb0 100644 --- a/src/Common/FileSegment.cpp +++ b/src/Common/FileSegment.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include namespace CurrentMetrics { @@ -25,7 +25,7 @@ FileSegment::FileSegment( size_t offset_, size_t size_, const Key & key_, - IFileCache * cache_, + FileCache * cache_, State download_state_, bool is_persistent_) : segment_range(offset_, offset_ + size_ - 1) @@ -787,7 +787,7 @@ FileSegmentsHolder::~FileSegmentsHolder() /// FileSegmentsHolder right after calling file_segment->complete(), so on destruction here /// remain only uncompleted file segments. 
- IFileCache * cache = nullptr; + FileCache * cache = nullptr; for (auto file_segment_it = file_segments.begin(); file_segment_it != file_segments.end();) { diff --git a/src/Common/FileSegment.h b/src/Common/FileSegment.h index 4404d0e14be..b129b851d7c 100644 --- a/src/Common/FileSegment.h +++ b/src/Common/FileSegment.h @@ -1,11 +1,11 @@ #pragma once #include -#include #include #include #include #include +#include namespace Poco { class Logger; } @@ -17,7 +17,7 @@ extern const Metric CacheFileSegments; namespace DB { -class IFileCache; +class FileCache; class FileSegment; using FileSegmentPtr = std::shared_ptr; @@ -32,7 +32,7 @@ friend struct FileSegmentsHolder; friend class FileSegmentRangeWriter; public: - using Key = IFileCache::Key; + using Key = FileCacheKey; using RemoteFileReaderPtr = std::shared_ptr; using LocalCacheWriterPtr = std::unique_ptr; @@ -74,7 +74,7 @@ public: size_t offset_, size_t size_, const Key & key_, - IFileCache * cache_, + FileCache * cache_, State download_state_, bool is_persistent_ = false); @@ -234,7 +234,7 @@ private: mutable std::mutex download_mutex; Key file_key; - IFileCache * cache; + FileCache * cache; Poco::Logger * log; diff --git a/src/Common/IFileCache.cpp b/src/Common/IFileCache.cpp deleted file mode 100644 index e3ed82d7b62..00000000000 --- a/src/Common/IFileCache.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#include "IFileCache.h" - -#include -#include -#include -#include -#include -#include - -namespace fs = std::filesystem; - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int REMOTE_FS_OBJECT_CACHE_ERROR; - extern const int LOGICAL_ERROR; -} - -IFileCache::IFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_) - : cache_base_path(cache_base_path_) - , max_size(cache_settings_.max_size) - , max_element_size(cache_settings_.max_elements) - , max_file_segment_size(cache_settings_.max_file_segment_size) - , enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit) -{ -} - -String IFileCache::Key::toString() const -{ - return getHexUIntLowercase(key); -} - -IFileCache::Key IFileCache::hash(const String & path) -{ - return Key(sipHash128(path.data(), path.size())); -} - -String IFileCache::getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const -{ - auto key_str = key.toString(); - return fs::path(cache_base_path) - / key_str.substr(0, 3) - / key_str - / (std::to_string(offset) + (is_persistent ? "_persistent" : "")); -} - -String IFileCache::getPathInLocalCache(const Key & key) const -{ - auto key_str = key.toString(); - return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str; -} - -static bool isQueryInitialized() -{ - return CurrentThread::isInitialized() - && CurrentThread::get().getQueryContext() - && !CurrentThread::getQueryId().empty(); -} - -bool IFileCache::isReadOnly() -{ - return !isQueryInitialized(); -} - -void IFileCache::assertInitialized() const -{ - if (!is_initialized) - throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized"); -} - -IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard & cache_lock) -{ - if (!isQueryInitialized()) - return nullptr; - - return getQueryContext(std::string(CurrentThread::getQueryId()), cache_lock); -} - -IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard & /* cache_lock */) -{ - auto query_iter = query_map.find(query_id); - return (query_iter == query_map.end()) ? 
nullptr : query_iter->second; -} - -void IFileCache::removeQueryContext(const String & query_id) -{ - std::lock_guard cache_lock(mutex); - auto query_iter = query_map.find(query_id); - - if (query_iter == query_map.end()) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Attempt to release query context that does not exist (query_id: {})", - query_id); - } - - query_map.erase(query_iter); -} - -IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext( - const String & query_id, const ReadSettings & settings, std::lock_guard & cache_lock) -{ - if (query_id.empty()) - return nullptr; - - auto context = getQueryContext(query_id, cache_lock); - if (context) - return context; - - auto query_context = std::make_shared(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache); - auto query_iter = query_map.emplace(query_id, query_context).first; - return query_iter->second; -} - -IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings) -{ - std::lock_guard cache_lock(mutex); - - if (!enable_filesystem_query_cache_limit || settings.max_query_cache_size == 0) - return {}; - - /// if enable_filesystem_query_cache_limit is true, and max_query_cache_size large than zero, - /// we create context query for current query. - auto context = getOrSetQueryContext(query_id, settings, cache_lock); - return QueryContextHolder(query_id, this, context); -} - -void IFileCache::QueryContext::remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) -{ - if (cache_size < size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record != records.end()) - { - record->second->remove(cache_lock); - records.erase({key, offset}); - } - } - cache_size -= size; -} - -void IFileCache::QueryContext::reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) -{ - if (cache_size + size > max_cache_size) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Reserved cache size exceeds the remaining cache size (key: {}, offset: {})", - key.toString(), offset); - } - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record == records.end()) - { - auto queue_iter = priority->add(key, offset, 0, cache_lock); - record = records.insert({{key, offset}, queue_iter}).first; - } - record->second->incrementSize(size, cache_lock); - } - cache_size += size; -} - -void IFileCache::QueryContext::use(const Key & key, size_t offset, std::lock_guard & cache_lock) -{ - if (skip_download_if_exceeds_query_cache) - return; - - auto record = records.find({key, offset}); - if (record != records.end()) - record->second->use(cache_lock); -} - -IFileCache::QueryContextHolder::QueryContextHolder( - const String & query_id_, - IFileCache * cache_, - IFileCache::QueryContextPtr context_) - : query_id(query_id_) - , cache(cache_) - , context(context_) -{ -} - -IFileCache::QueryContextHolder::~QueryContextHolder() -{ - /// If only the query_map and the current holder hold the context_query, - /// the query has been completed and the query_context is released. 
- if (context && context.use_count() == 2) - cache->removeQueryContext(query_id); -} - -} diff --git a/src/Common/IFileCache.h b/src/Common/IFileCache.h deleted file mode 100644 index f46a83d52cf..00000000000 --- a/src/Common/IFileCache.h +++ /dev/null @@ -1,216 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - - -namespace DB -{ - -class FileSegment; -using FileSegmentPtr = std::shared_ptr; -using FileSegments = std::list; -struct FileSegmentsHolder; -struct ReadSettings; - -/** - * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. - */ -class IFileCache : private boost::noncopyable -{ -friend class FileSegment; -friend struct FileSegmentsHolder; -friend class FileSegmentRangeWriter; - -public: - using Key = IFileCachePriority::Key; - - IFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); - - virtual ~IFileCache() = default; - - /// Restore cache from local filesystem. - virtual void initialize() = 0; - - virtual void removeIfExists(const Key & key) = 0; - - virtual void removeIfReleasable(bool remove_persistent_files) = 0; - - static bool isReadOnly(); - - /// Cache capacity in bytes. - size_t capacity() const { return max_size; } - - static Key hash(const String & path); - - String getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const; - - String getPathInLocalCache(const Key & key) const; - - const String & getBasePath() const { return cache_base_path; } - - virtual std::vector tryGetCachePaths(const Key & key) = 0; - - /** - * Given an `offset` and `size` representing [offset, offset + size) bytes interval, - * return list of cached non-overlapping non-empty - * file segments `[segment1, ..., segmentN]` which intersect with given interval. - * - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * As long as pointers to returned file segments are hold - * it is guaranteed that these file segments are not removed from cache. - */ - virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) = 0; - - /** - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" - * from cache (not owned by cache), and as a result will never change it's state and will be destructed - * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change - * it's state (and become DOWNLOADED). - */ - virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0; - - virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) = 0; - - virtual FileSegments getSnapshot() const = 0; - - /// For debug. 
- virtual String dumpStructure(const Key & key) = 0; - - virtual size_t getUsedCacheSize() const = 0; - - virtual size_t getFileSegmentsNum() const = 0; - -protected: - String cache_base_path; - size_t max_size; - size_t max_element_size; - size_t max_file_segment_size; - - bool is_initialized = false; - - mutable std::mutex mutex; - - virtual bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) = 0; - - virtual void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - virtual bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - virtual void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & /* segment_lock */) = 0; - - void assertInitialized() const; - - using AccessKeyAndOffset = std::pair; - struct KeyAndOffsetHash - { - std::size_t operator()(const AccessKeyAndOffset & key) const - { - return std::hash()(key.first.key) ^ std::hash()(key.second); - } - }; - - using FileCacheRecords = std::unordered_map; - - /// Used to track and control the cache access of each query. - /// Through it, we can realize the processing of different queries by the cache layer. - struct QueryContext - { - FileCacheRecords records; - FileCachePriorityPtr priority; - - size_t cache_size = 0; - size_t max_cache_size; - - bool skip_download_if_exceeds_query_cache; - - QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) - : max_cache_size(max_cache_size_) - , skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {} - - void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); - - void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); - - void use(const Key & key, size_t offset, std::lock_guard & cache_lock); - - size_t getMaxCacheSize() const { return max_cache_size; } - - size_t getCacheSize() const { return cache_size; } - - FileCachePriorityPtr getPriority() { return priority; } - - bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; } - }; - - using QueryContextPtr = std::shared_ptr; - using QueryContextMap = std::unordered_map; - - QueryContextMap query_map; - - bool enable_filesystem_query_cache_limit; - - QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); - - QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); - - void removeQueryContext(const String & query_id); - - QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); - -public: - /// Save a query context information, and adopt different cache policies - /// for different queries through the context cache layer. 
- struct QueryContextHolder : private boost::noncopyable - { - QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_); - - QueryContextHolder() = default; - - ~QueryContextHolder(); - - String query_id; - IFileCache * cache = nullptr; - QueryContextPtr context; - }; - - QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); - -}; - -using FileCachePtr = std::shared_ptr; - -} - -namespace std -{ -template <> struct hash -{ - std::size_t operator()(const DB::IFileCache::Key & k) const { return hash()(k.key); } -}; - -} diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 2e73bb92841..84e1a386d24 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -5,7 +5,8 @@ #include #include #include -#include +#include +#include namespace DB { @@ -15,6 +16,8 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +class FileCache; + class IFileCachePriority; using FileCachePriorityPtr = std::shared_ptr; @@ -22,20 +25,14 @@ using FileCachePriorityPtr = std::shared_ptr; class IFileCachePriority { public: - struct Key - { - UInt128 key; - String toString() const; - - Key() = default; - explicit Key(const UInt128 & key_) : key(key_) { } - - bool operator==(const Key & other) const { return key == other.key; } - }; - class IIterator; friend class IIterator; - using Iterator = std::shared_ptr; + + using ReadIterator = std::shared_ptr; + using WriteIterator = std::shared_ptr; + + friend class FileCache; + using Key = FileCacheKey; struct FileCacheRecord { @@ -56,30 +53,6 @@ public: public: virtual ~IIterator() = default; - virtual void next() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support next() for IIterator."); } - - virtual bool valid() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support valid() for IIterator."); } - - /// Mark a cache record as recently used, it will update the priority - /// of the cache record according to different cache algorithms. - virtual void use(std::lock_guard &) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support use() for IIterator."); - } - - /// Deletes an existing cached record. - virtual void remove(std::lock_guard &) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support remove() for IIterator."); - } - - virtual Iterator getSnapshot() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support getSnapshot() for IIterator."); } - - virtual void incrementSize(size_t, std::lock_guard &) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not support incrementSize() for IIterator."); - } - virtual Key key() const = 0; virtual size_t offset() const = 0; @@ -87,6 +60,23 @@ public: virtual size_t size() const = 0; virtual size_t hits() const = 0; + + virtual void next() const = 0; + + virtual bool valid() const = 0; + + /// Mark a cache record as recently used, it will update the priority + /// of the cache record according to different cache algorithms. + virtual void use(std::lock_guard &) = 0; + + /// Deletes an existing cached record. + virtual void remove(std::lock_guard &) = 0; + + virtual WriteIterator getWriteIterator() const = 0; + + virtual void incrementSize(size_t, std::lock_guard &) = 0; + + virtual void seekToLowestPriority() const = 0; }; public: @@ -94,7 +84,7 @@ public: /// Add a cache record that did not exist before, and throw a /// logical exception if the cache block already exists. 
- virtual Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) = 0; + virtual WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) = 0; /// Query whether a cache record exists. If it exists, return true. If not, return false. virtual bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) = 0; @@ -103,7 +93,7 @@ public: /// Returns an iterator pointing to the lowest priority cached record. /// We can traverse all cached records through the iterator's next(). - virtual Iterator getNewIterator(std::lock_guard & cache_lock) = 0; + virtual ReadIterator getNewIterator(std::lock_guard & cache_lock) = 0; virtual size_t getElementsNum(std::lock_guard & cache_lock) const = 0; diff --git a/src/Common/LRUFileCache.h b/src/Common/LRUFileCachePriority.h similarity index 61% rename from src/Common/LRUFileCache.h rename to src/Common/LRUFileCachePriority.h index 6e42a7732d4..bc9badc0af6 100644 --- a/src/Common/LRUFileCache.h +++ b/src/Common/LRUFileCachePriority.h @@ -5,19 +5,23 @@ namespace DB { -class LRUFileCache : public IFileCachePriority +/// Based on the LRU algorithm implementation, the data with the lowest priority is stored at +/// the head of the queue, and the data with the highest priority is stored at the tail. +class LRUFileCachePriority : public IFileCachePriority { public: using LRUQueue = std::list; using LRUQueueIterator = typename LRUQueue::iterator; - class WriteableIterator; - class ReadableIterator : public IIterator + class LRUFileCacheIterator : public IIterator { public: - ReadableIterator(LRUFileCache * file_cache_, LRUQueueIterator queue_iter_) : file_cache(file_cache_), queue_iter(queue_iter_) { } + LRUFileCacheIterator(LRUFileCachePriority * file_cache_, LRUQueueIterator queue_iter_) + : file_cache(file_cache_), queue_iter(queue_iter_) + { + } - void next() override { queue_iter++; } + void next() const override { queue_iter++; } bool valid() const override { return (file_cache->queue.size() && (queue_iter != file_cache->queue.end())); } @@ -29,17 +33,9 @@ public: size_t hits() const override { return queue_iter->hits; } - Iterator getSnapshot() override { return std::make_shared(file_cache, queue_iter); } + WriteIterator getWriteIterator() const override { return std::make_shared(file_cache, queue_iter); } - protected: - LRUFileCache * file_cache; - LRUQueueIterator queue_iter; - }; - - class WriteableIterator : public ReadableIterator - { - public: - WriteableIterator(LRUFileCache * file_cache_, LRUQueueIterator queue_iter_) : ReadableIterator(file_cache_, queue_iter_) { } + void seekToLowestPriority() const override { queue_iter = file_cache->queue.begin(); } void remove(std::lock_guard &) override { @@ -58,16 +54,20 @@ public: queue_iter->hits++; file_cache->queue.splice(file_cache->queue.end(), file_cache->queue, queue_iter); } + + private: + mutable LRUFileCachePriority * file_cache; + mutable LRUQueueIterator queue_iter; }; public: - LRUFileCache() = default; + LRUFileCachePriority() = default; - Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override + WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override { auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size)); cache_size += size; - return std::make_shared(this, iter); + return std::make_shared(this, iter); } bool contains(const Key & key, size_t offset, std::lock_guard &) override @@ -86,7 +86,10 @@ public: cache_size = 0; } - Iterator 
getNewIterator(std::lock_guard &) override { return std::make_shared(this, queue.begin()); } + ReadIterator getNewIterator(std::lock_guard &) override + { + return std::make_shared(this, queue.begin()); + } size_t getElementsNum(std::lock_guard &) const override { return queue.size(); } diff --git a/src/Common/tests/gtest_lru_file_cache.cpp b/src/Common/tests/gtest_lru_file_cache.cpp index ac942d97a32..3f481ee25ca 100644 --- a/src/Common/tests/gtest_lru_file_cache.cpp +++ b/src/Common/tests/gtest_lru_file_cache.cpp @@ -47,7 +47,7 @@ std::vector fromHolder(const DB::FileSegmentsHolder & holder return std::vector(holder.file_segments.begin(), holder.file_segments.end()); } -String getFileSegmentPath(const String & base_path, const DB::IFileCache::Key & key, size_t offset) +String getFileSegmentPath(const String & base_path, const DB::FileCache::Key & key, size_t offset) { auto key_str = key.toString(); return fs::path(base_path) / key_str.substr(0, 3) / key_str / DB::toString(offset); diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index a3d5cfc408d..a10e136334e 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -1024,7 +1024,7 @@ std::optional CachedReadBufferFromRemoteFS::getLastNonDownloadedOffset() void CachedReadBufferFromRemoteFS::assertCorrectness() const { - if (IFileCache::isReadOnly() && !settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache) + if (FileCache::isReadOnly() && !settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache usage is not allowed"); } diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.h b/src/Disks/IO/CachedReadBufferFromRemoteFS.h index aff29dd200c..7fe3af29ef7 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.h +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include @@ -81,7 +81,7 @@ private: bool writeCache(char * data, size_t size, size_t offset, FileSegment & file_segment); Poco::Logger * log; - IFileCache::Key cache_key; + FileCache::Key cache_key; String remote_fs_object_path; FileCachePtr cache; ReadSettings settings; @@ -128,7 +128,7 @@ private: CurrentMetrics::Increment metric_increment{CurrentMetrics::FilesystemCacheReadBuffers}; ProfileEvents::Counters current_file_segment_counters; - IFileCache::QueryContextHolder query_context_holder; + FileCache::QueryContextHolder query_context_holder; bool is_persistent; }; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 970a971d5dc..0849a3f09e3 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp b/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp index b8ab2f49202..499791caf94 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 1ab2d75ff86..69c1c31403d 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -16,6 +16,7 @@ #include 
#include #include +#include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 25dafac4120..901deeebefc 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 51f0c0d0743..79da7832a34 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -3,8 +3,8 @@ #if USE_AWS_S3 #include -#include #include +#include #include #include diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 9fd27fc28b6..f9bc22dd110 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -12,9 +12,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/src/Interpreters/InterpreterDescribeCacheQuery.cpp b/src/Interpreters/InterpreterDescribeCacheQuery.cpp index dd6df26c6af..d7c13dbb077 100644 --- a/src/Interpreters/InterpreterDescribeCacheQuery.cpp +++ b/src/Interpreters/InterpreterDescribeCacheQuery.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 695ea53e65e..b37274a3152 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/System/StorageSystemFilesystemCache.cpp b/src/Storages/System/StorageSystemFilesystemCache.cpp index 2baddadec90..6d711498091 100644 --- a/src/Storages/System/StorageSystemFilesystemCache.cpp +++ b/src/Storages/System/StorageSystemFilesystemCache.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index a482f5d87ca..b224a72e787 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -1,7 +1,7 @@ #include "StorageSystemRemoteDataPaths.h" #include #include -#include +#include #include #include #include From 081cd4938ad2f3f8a1cddcbe42d9dc2698e37abf Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 28 Jun 2022 03:35:37 +0800 Subject: [PATCH 128/164] fix style --- src/Common/FileCache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index bccd2dbd458..4d2654d1491 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -24,7 +24,7 @@ namespace DB /** * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. 
- */ + */ class FileCache : private boost::noncopyable { friend class FileSegment; From 61b580aba449a7d7bb756659c287c3f8fbfc2564 Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 28 Jun 2022 03:50:44 +0800 Subject: [PATCH 129/164] add note --- src/Common/FileCache.h | 5 ++--- src/Common/IFileCachePriority.h | 3 +++ src/Common/LRUFileCachePriority.h | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index 4d2654d1491..f809f57f389 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -22,9 +22,8 @@ namespace DB { -/** - * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. - */ +/// Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. +/// Different caching algorithms are implemented based on IFileCachePriority. class FileCache : private boost::noncopyable { friend class FileSegment; diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 84e1a386d24..568b778d296 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -61,6 +61,7 @@ public: virtual size_t hits() const = 0; + /// Point the iterator to the next higher priority cache record. virtual void next() const = 0; virtual bool valid() const = 0; @@ -72,6 +73,8 @@ public: /// Deletes an existing cached record. virtual void remove(std::lock_guard &) = 0; + /// Get an iterator to handle write operations. Write iterators should only + /// be allowed to call remove, use and incrementSize methods. virtual WriteIterator getWriteIterator() const = 0; virtual void incrementSize(size_t, std::lock_guard &) = 0; diff --git a/src/Common/LRUFileCachePriority.h b/src/Common/LRUFileCachePriority.h index bc9badc0af6..10ad21672dd 100644 --- a/src/Common/LRUFileCachePriority.h +++ b/src/Common/LRUFileCachePriority.h @@ -5,8 +5,8 @@ namespace DB { -/// Based on the LRU algorithm implementation, the data with the lowest priority is stored at -/// the head of the queue, and the data with the highest priority is stored at the tail. +/// Based on the LRU algorithm implementation, the record with the lowest priority is stored at +/// the head of the queue, and the record with the highest priority is stored at the tail. 
class LRUFileCachePriority : public IFileCachePriority { public: From 164fa1ab0e5a76c5e124df11d5fb12345c0c131a Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Tue, 28 Jun 2022 14:36:41 +0800 Subject: [PATCH 130/164] fix build and style --- src/Common/FileCache.cpp | 4 +--- src/Common/FileCache.h | 2 +- src/Common/IFileCachePriority.h | 9 ++------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 2a2fc68e768..8358504c70e 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -234,7 +234,7 @@ void FileCache::initialize() } void FileCache::useCell( - const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock) + const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock) const { auto file_segment = cell.file_segment; @@ -945,8 +945,6 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) || remove_persistent_files || allow_to_remove_persistent_segments_from_cache_by_default)) { - std::lock_guard segment_lock(file_segment->mutex); - file_segment->detach(cache_lock, segment_lock); to_remove.emplace_back(file_segment); } } diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index f809f57f389..8c6a9396b43 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -243,7 +243,7 @@ private: bool is_persistent, std::lock_guard & cache_lock); - void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); + void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock) const; bool tryReserveForMainList( const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard & cache_lock); diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 568b778d296..df3ffd9fd9c 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -11,13 +11,7 @@ namespace DB { -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - class FileCache; - class IFileCachePriority; using FileCachePriorityPtr = std::shared_ptr; @@ -66,7 +60,7 @@ public: virtual bool valid() const = 0; - /// Mark a cache record as recently used, it will update the priority + /// Mark a cache record as recently used, it will update the priority /// of the cache record according to different cache algorithms. virtual void use(std::lock_guard &) = 0; @@ -79,6 +73,7 @@ public: virtual void incrementSize(size_t, std::lock_guard &) = 0; + /// Repoint the iterator to the record with the lowest priority. 
virtual void seekToLowestPriority() const = 0; }; From 1b01cc8ed943c0053cebe98a7464636f2b643c33 Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Wed, 29 Jun 2022 17:44:38 +0800 Subject: [PATCH 131/164] fix --- src/Common/FileCache.cpp | 38 +++++++++++++------------- src/Common/FileCacheType.h | 5 +++- src/Common/IFileCachePriority.h | 26 ++++++++---------- src/Common/LRUFileCachePriority.h | 45 ++++++++++++++++++++++--------- 4 files changed, 67 insertions(+), 47 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 8358504c70e..818cc0c1b76 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -40,11 +39,6 @@ FileCache::FileCache( { } -String FileCache::Key::toString() const -{ - return getHexUIntLowercase(key); -} - FileCache::Key FileCache::hash(const String & path) { return Key(sipHash128(path.data(), path.size())); @@ -323,8 +317,11 @@ FileSegments FileCache::getImpl( if (range.left <= prev_cell_range.right) { + /// segment{k-1} segment{k} /// [________] [_____ /// [___________ + /// ^ + /// range.left useCell(prev_cell, result, cache_lock); } } @@ -562,7 +559,7 @@ FileCache::FileSegmentCell * FileCache::addCell( if (stash_priority->getElementsNum(cache_lock) > max_stash_element_size) { - auto remove_priority_iter = stash_priority->getNewIterator(cache_lock)->getWriteIterator(); + auto remove_priority_iter = stash_priority->getLowestPriorityWriteIterator(cache_lock); stash_records.erase({remove_priority_iter->key(), remove_priority_iter->offset()}); remove_priority_iter->remove(cache_lock); } @@ -648,7 +645,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc auto * cell_for_reserve = getCell(key, offset, cache_lock); - std::vector ghost; + std::vector> ghost; std::vector trash; std::vector to_evict; @@ -660,7 +657,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc }; /// Select the cache from the LRU queue held by query for expulsion. - for (auto iter = query_context->getPriority()->getNewIterator(cache_lock); iter->valid(); iter->next()) + for (auto iter = query_context->getPriority()->getLowestPriorityWriteIterator(cache_lock); iter->valid();) { if (!is_overflow()) break; @@ -671,8 +668,10 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc { /// The cache corresponding to this record may be swapped out by /// other queries, so it has become invalid. 
- ghost.push_back(iter->getWriteIterator()); removed_size += iter->size(); + ghost.push_back({iter->key(), iter->offset(), iter->size()}); + /// next() + iter->remove(cache_lock); } else { @@ -700,6 +699,8 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc removed_size += cell_size; --queue_size; } + + iter->next(); } } @@ -718,8 +719,8 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc remove_file_segment(file_segment, cell->size()); } - for (auto & iter : ghost) - query_context->remove(iter->key(), iter->offset(), iter->size(), cache_lock); + for (auto & entry : ghost) + query_context->remove(std::get<0>(entry), std::get<1>(entry), std::get<2>(entry), cache_lock); if (is_overflow()) return false; @@ -770,7 +771,7 @@ bool FileCache::tryReserveForMainList( std::vector to_evict; std::vector trash; - for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) + for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { auto entry_key = it->key(); auto entry_offset = it->offset(); @@ -926,9 +927,9 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) std::lock_guard cache_lock(mutex); std::vector to_remove; - for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) + for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { - auto key = it->key(); + const auto & key = it->key(); auto offset = it->offset(); auto * cell = getCell(key, offset, cache_lock); @@ -1247,7 +1248,7 @@ String FileCache::dumpStructure(const Key & key) return dumpStructureUnlocked(key, cache_lock); } -String FileCache::dumpStructureUnlocked(const Key & key, std::lock_guard & cache_lock) +String FileCache::dumpStructureUnlocked(const Key & key, std::lock_guard &) { WriteBufferFromOwnString result; const auto & cells_by_offset = files[key]; @@ -1255,7 +1256,6 @@ String FileCache::dumpStructureUnlocked(const Key & key, std::lock_guardgetInfoForLog() << "\n"; - result << "\n\nPriority: " << main_priority->toString(cache_lock); return result.str(); } @@ -1291,9 +1291,9 @@ void FileCache::assertCacheCorrectness(std::lock_guard & cache_lock) void FileCache::assertPriorityCorrectness(std::lock_guard & cache_lock) { [[maybe_unused]] size_t total_size = 0; - for (auto it = main_priority->getNewIterator(cache_lock); it->valid(); it->next()) + for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { - auto key = it->key(); + const auto & key = it->key(); auto offset = it->offset(); auto size = it->size(); diff --git a/src/Common/FileCacheType.h b/src/Common/FileCacheType.h index 9b3ec5a6af0..cf4ab5d20c5 100644 --- a/src/Common/FileCacheType.h +++ b/src/Common/FileCacheType.h @@ -1,5 +1,6 @@ #pragma once #include +#include namespace DB { @@ -7,9 +8,11 @@ namespace DB struct FileCacheKey { UInt128 key; - String toString() const; + + String toString() const { return getHexUIntLowercase(key); } FileCacheKey() = default; + explicit FileCacheKey(const UInt128 & key_) : key(key_) { } bool operator==(const FileCacheKey & other) const { return key == other.key; } diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index df3ffd9fd9c..8b448a5ef9d 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -21,13 +21,13 @@ class IFileCachePriority public: class IIterator; friend class IIterator; + friend class FileCache; + + using Key = 
FileCacheKey; using ReadIterator = std::shared_ptr; using WriteIterator = std::shared_ptr; - friend class FileCache; - using Key = FileCacheKey; - struct FileCacheRecord { Key key; @@ -47,7 +47,7 @@ public: public: virtual ~IIterator() = default; - virtual Key key() const = 0; + virtual const Key & key() const = 0; virtual size_t offset() const = 0; @@ -64,17 +64,11 @@ public: /// of the cache record according to different cache algorithms. virtual void use(std::lock_guard &) = 0; - /// Deletes an existing cached record. + /// Deletes an existing cached record. And to avoid pointer suspension + /// the iterator should automatically point to the next record. virtual void remove(std::lock_guard &) = 0; - /// Get an iterator to handle write operations. Write iterators should only - /// be allowed to call remove, use and incrementSize methods. - virtual WriteIterator getWriteIterator() const = 0; - virtual void incrementSize(size_t, std::lock_guard &) = 0; - - /// Repoint the iterator to the record with the lowest priority. - virtual void seekToLowestPriority() const = 0; }; public: @@ -84,6 +78,7 @@ public: /// logical exception if the cache block already exists. virtual WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) = 0; + /// This method is used for assertions in debug mode. So we do not care about complexity here. /// Query whether a cache record exists. If it exists, return true. If not, return false. virtual bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) = 0; @@ -91,14 +86,15 @@ public: /// Returns an iterator pointing to the lowest priority cached record. /// We can traverse all cached records through the iterator's next(). - virtual ReadIterator getNewIterator(std::lock_guard & cache_lock) = 0; + virtual ReadIterator getLowestPriorityReadIterator(std::lock_guard & cache_lock) = 0; + + /// The same as getLowestPriorityReadIterator(), but it is writeable. + virtual WriteIterator getLowestPriorityWriteIterator(std::lock_guard & cache_lock) = 0; virtual size_t getElementsNum(std::lock_guard & cache_lock) const = 0; size_t getCacheSize(std::lock_guard &) const { return cache_size; } - virtual std::string toString(std::lock_guard & cache_lock) const = 0; - protected: size_t max_cache_size = 0; size_t cache_size = 0; diff --git a/src/Common/LRUFileCachePriority.h b/src/Common/LRUFileCachePriority.h index 10ad21672dd..ecbe2b47bd8 100644 --- a/src/Common/LRUFileCachePriority.h +++ b/src/Common/LRUFileCachePriority.h @@ -5,11 +5,16 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + /// Based on the LRU algorithm implementation, the record with the lowest priority is stored at /// the head of the queue, and the record with the highest priority is stored at the tail. 
class LRUFileCachePriority : public IFileCachePriority { -public: +private: using LRUQueue = std::list; using LRUQueueIterator = typename LRUQueue::iterator; @@ -23,9 +28,9 @@ public: void next() const override { queue_iter++; } - bool valid() const override { return (file_cache->queue.size() && (queue_iter != file_cache->queue.end())); } + bool valid() const override { return queue_iter != file_cache->queue.end(); } - Key key() const override { return queue_iter->key; } + const Key & key() const override { return queue_iter->key; } size_t offset() const override { return queue_iter->offset; } @@ -33,14 +38,12 @@ public: size_t hits() const override { return queue_iter->hits; } - WriteIterator getWriteIterator() const override { return std::make_shared(file_cache, queue_iter); } - - void seekToLowestPriority() const override { queue_iter = file_cache->queue.begin(); } - void remove(std::lock_guard &) override { - file_cache->cache_size -= queue_iter->size; - file_cache->queue.erase(queue_iter); + auto remove_iter = queue_iter; + queue_iter++; + file_cache->cache_size -= remove_iter->size; + file_cache->queue.erase(remove_iter); } void incrementSize(size_t size_increment, std::lock_guard &) override @@ -65,6 +68,18 @@ public: WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override { +#ifndef NDEBUG + for (const auto & entry : queue) + { + if (entry.key() == key && entry.offset() == offset) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to add duplicate queue entry to queue. (Key: {}, offset: {}, size: {})", + entry.key().toString(), + entry.offset(), + entry.size()); + } +#endif auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size)); cache_size += size; return std::make_shared(this, iter); @@ -86,14 +101,20 @@ public: cache_size = 0; } - ReadIterator getNewIterator(std::lock_guard &) override + ReadIterator getLowestPriorityReadIterator(std::lock_guard &) override { return std::make_shared(this, queue.begin()); } - size_t getElementsNum(std::lock_guard &) const override { return queue.size(); } + WriteIterator getLowestPriorityWriteIterator(std::lock_guard &) override + { + return std::make_shared(this, queue.begin()); + } - std::string toString(std::lock_guard &) const override { return {}; } + size_t getElementsNum(std::lock_guard &) const override + { + return queue.size(); + } private: LRUQueue queue; From d2b5581632e8025a605d7832212d689a4fd85266 Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Wed, 29 Jun 2022 21:28:19 +0800 Subject: [PATCH 132/164] fix --- src/Common/FileCache.cpp | 2 +- src/Common/IFileCachePriority.h | 1 - src/Common/LRUFileCachePriority.cpp | 62 ++++++++++++ src/Common/LRUFileCachePriority.h | 146 ++++++++++------------------ 4 files changed, 115 insertions(+), 96 deletions(-) create mode 100644 src/Common/LRUFileCachePriority.cpp diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 818cc0c1b76..f49f441969c 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -773,7 +773,7 @@ bool FileCache::tryReserveForMainList( for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { - auto entry_key = it->key(); + const auto & entry_key = it->key(); auto entry_offset = it->offset(); if (!is_overflow()) diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 8b448a5ef9d..691f23a1b54 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -24,7 +24,6 @@ 
public: friend class FileCache; using Key = FileCacheKey; - using ReadIterator = std::shared_ptr; using WriteIterator = std::shared_ptr; diff --git a/src/Common/LRUFileCachePriority.cpp b/src/Common/LRUFileCachePriority.cpp new file mode 100644 index 00000000000..ed13e58e5fd --- /dev/null +++ b/src/Common/LRUFileCachePriority.cpp @@ -0,0 +1,62 @@ +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +IFileCachePriority::WriteIterator +LRUFileCachePriority::add(const Key & key, size_t offset, size_t size, std::lock_guard &) override +{ +#ifndef NDEBUG + for (const auto & entry : queue) + { + if (entry.key == key && entry.offset == offset) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to add duplicate queue entry to queue. (Key: {}, offset: {}, size: {})", + entry.key.toString(), + entry.offset, + entry.size); + } +#endif + auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size)); + cache_size += size; + return std::make_shared(this, iter); +} + +bool LRUFileCachePriority::contains(const Key & key, size_t offset, std::lock_guard &) override +{ + for (const auto & record : queue) + { + if (key == record.key && offset == record.offset) + return true; + } + return false; +} + +void LRUFileCachePriority::removeAll(std::lock_guard &) override +{ + queue.clear(); + cache_size = 0; +} + +IFileCachePriority::ReadIterator LRUFileCachePriority::getLowestPriorityReadIterator(std::lock_guard &) override +{ + return std::make_shared(this, queue.begin()); +} + +IFileCachePriority::WriteIterator LRUFileCachePriority::getLowestPriorityWriteIterator(std::lock_guard &) override +{ + return std::make_shared(this, queue.begin()); +} + +size_t LRUFileCachePriority::getElementsNum(std::lock_guard &) const override +{ + return queue.size(); +} + +}; diff --git a/src/Common/LRUFileCachePriority.h b/src/Common/LRUFileCachePriority.h index ecbe2b47bd8..20d1e05b9f0 100644 --- a/src/Common/LRUFileCachePriority.h +++ b/src/Common/LRUFileCachePriority.h @@ -5,11 +5,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - /// Based on the LRU algorithm implementation, the record with the lowest priority is stored at /// the head of the queue, and the record with the highest priority is stored at the tail. 
class LRUFileCachePriority : public IFileCachePriority @@ -17,107 +12,70 @@ class LRUFileCachePriority : public IFileCachePriority private: using LRUQueue = std::list; using LRUQueueIterator = typename LRUQueue::iterator; - - class LRUFileCacheIterator : public IIterator - { - public: - LRUFileCacheIterator(LRUFileCachePriority * file_cache_, LRUQueueIterator queue_iter_) - : file_cache(file_cache_), queue_iter(queue_iter_) - { - } - - void next() const override { queue_iter++; } - - bool valid() const override { return queue_iter != file_cache->queue.end(); } - - const Key & key() const override { return queue_iter->key; } - - size_t offset() const override { return queue_iter->offset; } - - size_t size() const override { return queue_iter->size; } - - size_t hits() const override { return queue_iter->hits; } - - void remove(std::lock_guard &) override - { - auto remove_iter = queue_iter; - queue_iter++; - file_cache->cache_size -= remove_iter->size; - file_cache->queue.erase(remove_iter); - } - - void incrementSize(size_t size_increment, std::lock_guard &) override - { - file_cache->cache_size += size_increment; - queue_iter->size += size_increment; - } - - void use(std::lock_guard &) override - { - queue_iter->hits++; - file_cache->queue.splice(file_cache->queue.end(), file_cache->queue, queue_iter); - } - - private: - mutable LRUFileCachePriority * file_cache; - mutable LRUQueueIterator queue_iter; - }; + class LRUFileCacheIterator; public: LRUFileCachePriority() = default; - WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override - { -#ifndef NDEBUG - for (const auto & entry : queue) - { - if (entry.key() == key && entry.offset() == offset) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Attempt to add duplicate queue entry to queue. 
(Key: {}, offset: {}, size: {})", - entry.key().toString(), - entry.offset(), - entry.size()); - } -#endif - auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size)); - cache_size += size; - return std::make_shared(this, iter); - } + WriteIterator add(const Key & key, size_t offset, size_t size, std::lock_guard &) override; - bool contains(const Key & key, size_t offset, std::lock_guard &) override - { - for (const auto & record : queue) - { - if (key == record.key && offset == record.offset) - return true; - } - return false; - } + bool contains(const Key & key, size_t offset, std::lock_guard &) override; - void removeAll(std::lock_guard &) override - { - queue.clear(); - cache_size = 0; - } + void removeAll(std::lock_guard &) override; - ReadIterator getLowestPriorityReadIterator(std::lock_guard &) override - { - return std::make_shared(this, queue.begin()); - } + ReadIterator getLowestPriorityReadIterator(std::lock_guard &) override; - WriteIterator getLowestPriorityWriteIterator(std::lock_guard &) override - { - return std::make_shared(this, queue.begin()); - } + WriteIterator getLowestPriorityWriteIterator(std::lock_guard &) override; - size_t getElementsNum(std::lock_guard &) const override - { - return queue.size(); - } + size_t getElementsNum(std::lock_guard &) const override; private: LRUQueue queue; }; +class LRUFileCachePriority::LRUFileCacheIterator : public IFileCachePriority::IIterator +{ +public: + LRUFileCacheIterator(LRUFileCachePriority * file_cache_, LRUFileCachePriority::LRUQueueIterator queue_iter_) + : file_cache(file_cache_), queue_iter(queue_iter_) + { + } + + void next() const override { queue_iter++; } + + bool valid() const override { return queue_iter != file_cache->queue.end(); } + + const Key & key() const override { return queue_iter->key; } + + size_t offset() const override { return queue_iter->offset; } + + size_t size() const override { return queue_iter->size; } + + size_t hits() const override { return queue_iter->hits; } + + void remove(std::lock_guard &) override + { + auto remove_iter = queue_iter; + queue_iter++; + file_cache->cache_size -= remove_iter->size; + file_cache->queue.erase(remove_iter); + } + + void incrementSize(size_t size_increment, std::lock_guard &) override + { + file_cache->cache_size += size_increment; + queue_iter->size += size_increment; + } + + void use(std::lock_guard &) override + { + queue_iter->hits++; + file_cache->queue.splice(file_cache->queue.end(), file_cache->queue, queue_iter); + } + +private: + mutable LRUFileCachePriority * file_cache; + mutable LRUFileCachePriority::LRUQueueIterator queue_iter; +}; + }; From f6a58bff4ca1ef3ada0b0b942f675a812dec6e47 Mon Sep 17 00:00:00 2001 From: KinderRiven <1339764596@qq.com> Date: Wed, 29 Jun 2022 22:45:34 +0800 Subject: [PATCH 133/164] fix build --- src/Common/IFileCachePriority.h | 6 ------ src/Common/LRUFileCachePriority.cpp | 13 ++++++------- src/Common/LRUFileCachePriority.h | 3 ++- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 691f23a1b54..fe925fd275d 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -1,17 +1,14 @@ #pragma once -#include #include #include #include #include -#include #include namespace DB { -class FileCache; class IFileCachePriority; using FileCachePriorityPtr = std::shared_ptr; @@ -20,9 +17,6 @@ class IFileCachePriority { public: class IIterator; - friend class IIterator; - friend class FileCache; - using Key = 
FileCacheKey; using ReadIterator = std::shared_ptr; using WriteIterator = std::shared_ptr; diff --git a/src/Common/LRUFileCachePriority.cpp b/src/Common/LRUFileCachePriority.cpp index ed13e58e5fd..c54b65f6ee0 100644 --- a/src/Common/LRUFileCachePriority.cpp +++ b/src/Common/LRUFileCachePriority.cpp @@ -8,8 +8,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -IFileCachePriority::WriteIterator -LRUFileCachePriority::add(const Key & key, size_t offset, size_t size, std::lock_guard &) override +IFileCachePriority::WriteIterator LRUFileCachePriority::add(const Key & key, size_t offset, size_t size, std::lock_guard &) { #ifndef NDEBUG for (const auto & entry : queue) @@ -28,7 +27,7 @@ LRUFileCachePriority::add(const Key & key, size_t offset, size_t size, std::lock return std::make_shared(this, iter); } -bool LRUFileCachePriority::contains(const Key & key, size_t offset, std::lock_guard &) override +bool LRUFileCachePriority::contains(const Key & key, size_t offset, std::lock_guard &) { for (const auto & record : queue) { @@ -38,23 +37,23 @@ bool LRUFileCachePriority::contains(const Key & key, size_t offset, std::lock_gu return false; } -void LRUFileCachePriority::removeAll(std::lock_guard &) override +void LRUFileCachePriority::removeAll(std::lock_guard &) { queue.clear(); cache_size = 0; } -IFileCachePriority::ReadIterator LRUFileCachePriority::getLowestPriorityReadIterator(std::lock_guard &) override +IFileCachePriority::ReadIterator LRUFileCachePriority::getLowestPriorityReadIterator(std::lock_guard &) { return std::make_shared(this, queue.begin()); } -IFileCachePriority::WriteIterator LRUFileCachePriority::getLowestPriorityWriteIterator(std::lock_guard &) override +IFileCachePriority::WriteIterator LRUFileCachePriority::getLowestPriorityWriteIterator(std::lock_guard &) { return std::make_shared(this, queue.begin()); } -size_t LRUFileCachePriority::getElementsNum(std::lock_guard &) const override +size_t LRUFileCachePriority::getElementsNum(std::lock_guard &) const { return queue.size(); } diff --git a/src/Common/LRUFileCachePriority.h b/src/Common/LRUFileCachePriority.h index 20d1e05b9f0..250a55480f9 100644 --- a/src/Common/LRUFileCachePriority.h +++ b/src/Common/LRUFileCachePriority.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace DB @@ -10,9 +11,9 @@ namespace DB class LRUFileCachePriority : public IFileCachePriority { private: + class LRUFileCacheIterator; using LRUQueue = std::list; using LRUQueueIterator = typename LRUQueue::iterator; - class LRUFileCacheIterator; public: LRUFileCachePriority() = default; From fbaa70b3130e1ae66ec0fdefed5f2921dc55c9cf Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Thu, 28 Jul 2022 13:23:57 +0800 Subject: [PATCH 134/164] fix --- src/Common/FileCache.cpp | 4 ++-- src/Disks/ObjectStorages/LocalObjectStorage.cpp | 2 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index f49f441969c..2d13c23e837 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -63,7 +63,7 @@ static bool isQueryInitialized() { return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() - && CurrentThread::getQueryId().size != 0; + && CurrentThread::getQueryId().size() != 0; } bool FileCache::isReadOnly() @@ -82,7 +82,7 @@ FileCache::QueryContextPtr FileCache::getCurrentQueryContext(std::lock_guard & /* cache_lock */) diff --git a/src/Disks/ObjectStorages/LocalObjectStorage.cpp 
b/src/Disks/ObjectStorages/LocalObjectStorage.cpp index c052f2f0d77..af6dab0b8a6 100644 --- a/src/Disks/ObjectStorages/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/LocalObjectStorage.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 901deeebefc..e017e19c06c 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -128,7 +128,7 @@ void S3ObjectStorage::removeCacheIfExists(const std::string & path_key) if (!cache || path_key.empty()) return; - IFileCache::Key key = cache->hash(path_key); + FileCache::Key key = cache->hash(path_key); cache->removeIfExists(key); } @@ -500,7 +500,7 @@ ReadSettings S3ObjectStorage::patchSettings(const ReadSettings & read_settings) ReadSettings settings{read_settings}; if (cache) { - if (IFileCache::isReadOnly()) + if (FileCache::isReadOnly()) settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache = true; settings.remote_fs_cache = cache; From 76e0aad69e361e9a25823b7c9287b785a5111517 Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Tue, 9 Aug 2022 20:21:57 +0800 Subject: [PATCH 135/164] fix --- src/Common/FileCache.cpp | 32 ++++++++++++------- src/Common/FileCache.h | 4 +-- src/Common/IFileCachePriority.h | 2 +- src/Common/LRUFileCachePriority.h | 10 +++--- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 2d13c23e837..d97d20310c8 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -30,8 +30,8 @@ FileCache::FileCache( , max_element_size(cache_settings_.max_elements) , max_file_segment_size(cache_settings_.max_file_segment_size) , enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit) - , main_priority(std::make_shared()) - , stash_priority(std::make_shared()) + , main_priority(std::make_unique()) + , stash_priority(std::make_unique()) , max_stash_element_size(cache_settings_.max_elements) , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold) , log(&Poco::Logger::get("FileCache")) @@ -145,7 +145,7 @@ void FileCache::QueryContext::remove(const Key & key, size_t offset, size_t size auto record = records.find({key, offset}); if (record != records.end()) { - record->second->remove(cache_lock); + record->second->removeAndGetNext(cache_lock); records.erase({key, offset}); } } @@ -561,7 +561,7 @@ FileCache::FileSegmentCell * FileCache::addCell( { auto remove_priority_iter = stash_priority->getLowestPriorityWriteIterator(cache_lock); stash_records.erase({remove_priority_iter->key(), remove_priority_iter->offset()}); - remove_priority_iter->remove(cache_lock); + remove_priority_iter->removeAndGetNext(cache_lock); } /// For segments that do not reach the download threshold, we do not download them, but directly read them result_state = FileSegment::State::SKIP_CACHE; @@ -645,7 +645,17 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc auto * cell_for_reserve = getCell(key, offset, cache_lock); - std::vector> ghost; + struct Segment + { + Key key; + size_t offset; + size_t size; + + Segment(Key key_, size_t offset_, size_t size_) + : key(key_), offset(offset_), size(size_) {} + }; + + std::vector ghost; std::vector trash; std::vector to_evict; @@ -669,9 +679,9 @@ bool FileCache::tryReserve(const Key & key, size_t 
offset, size_t size, std::loc /// The cache corresponding to this record may be swapped out by /// other queries, so it has become invalid. removed_size += iter->size(); - ghost.push_back({iter->key(), iter->offset(), iter->size()}); + ghost.push_back(Segment(iter->key(), iter->offset(), iter->size())); /// next() - iter->remove(cache_lock); + iter->removeAndGetNext(cache_lock); } else { @@ -720,7 +730,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc } for (auto & entry : ghost) - query_context->remove(std::get<0>(entry), std::get<1>(entry), std::get<2>(entry), cache_lock); + query_context->remove(entry.key, entry.offset, entry.size, cache_lock); if (is_overflow()) return false; @@ -926,7 +936,7 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) std::lock_guard cache_lock(mutex); - std::vector to_remove; + std::vector to_remove; for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { const auto & key = it->key(); @@ -946,7 +956,7 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) || remove_persistent_files || allow_to_remove_persistent_segments_from_cache_by_default)) { - to_remove.emplace_back(file_segment); + to_remove.emplace_back(file_segment.get()); } } } @@ -981,7 +991,7 @@ void FileCache::remove( if (cell->queue_iterator) { - cell->queue_iterator->remove(cache_lock); + cell->queue_iterator->removeAndGetNext(cache_lock); } auto & offsets = files[key]; diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h index 8c6a9396b43..7a25632be68 100644 --- a/src/Common/FileCache.h +++ b/src/Common/FileCache.h @@ -220,10 +220,10 @@ private: using CachedFiles = std::unordered_map; CachedFiles files; - FileCachePriorityPtr main_priority; + std::unique_ptr main_priority; FileCacheRecords stash_records; - FileCachePriorityPtr stash_priority; + std::unique_ptr stash_priority; size_t max_stash_element_size; size_t enable_cache_hits_threshold; diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index fe925fd275d..59ce3c0aebb 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -59,7 +59,7 @@ public: /// Deletes an existing cached record. And to avoid pointer suspension /// the iterator should automatically point to the next record. 
- virtual void remove(std::lock_guard &) = 0; + virtual void removeAndGetNext(std::lock_guard &) = 0; virtual void incrementSize(size_t, std::lock_guard &) = 0; }; diff --git a/src/Common/LRUFileCachePriority.h b/src/Common/LRUFileCachePriority.h index 250a55480f9..0f5755e1cb8 100644 --- a/src/Common/LRUFileCachePriority.h +++ b/src/Common/LRUFileCachePriority.h @@ -54,12 +54,10 @@ public: size_t hits() const override { return queue_iter->hits; } - void remove(std::lock_guard &) override + void removeAndGetNext(std::lock_guard &) override { - auto remove_iter = queue_iter; - queue_iter++; - file_cache->cache_size -= remove_iter->size; - file_cache->queue.erase(remove_iter); + file_cache->cache_size -= queue_iter->size; + queue_iter = file_cache->queue.erase(queue_iter); } void incrementSize(size_t size_increment, std::lock_guard &) override @@ -75,7 +73,7 @@ public: } private: - mutable LRUFileCachePriority * file_cache; + LRUFileCachePriority * file_cache; mutable LRUFileCachePriority::LRUQueueIterator queue_iter; }; diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 3ac4ea07945..f21e2bd7642 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -35,7 +35,7 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( with_cache = settings.remote_fs_cache && settings.enable_filesystem_cache - && (!IFileCache::isReadOnly() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache); + && (!FileCache::isReadOnly() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache); } SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const String & path, size_t file_size) From 2ae02a49214f86ff2f00e59a6d599e21db7dc2ee Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Tue, 9 Aug 2022 20:38:49 +0800 Subject: [PATCH 136/164] fix style --- src/Common/FileCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index d97d20310c8..2c8e62ac124 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -651,7 +651,7 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc size_t offset; size_t size; - Segment(Key key_, size_t offset_, size_t size_) + Segment(Key key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {} }; From 9ba94e64f97d738e17c0d99dfb5ed7c556ac9956 Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Wed, 10 Aug 2022 16:11:06 +0800 Subject: [PATCH 137/164] fix --- src/Common/FileCache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 2c8e62ac124..5a0145f0018 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -936,7 +936,7 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) std::lock_guard cache_lock(mutex); - std::vector to_remove; + std::vector to_remove; for (auto it = main_priority->getLowestPriorityReadIterator(cache_lock); it->valid(); it->next()) { const auto & key = it->key(); @@ -956,7 +956,7 @@ void FileCache::removeIfReleasable(bool remove_persistent_files) || remove_persistent_files || allow_to_remove_persistent_segments_from_cache_by_default)) { - to_remove.emplace_back(file_segment.get()); + to_remove.emplace_back(file_segment); } } } From 9b7f87677dd4fa8f09c4c58c90630941a4d97746 Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Wed, 10 Aug 2022 22:04:43 +0800 
Subject: [PATCH 138/164] fix --- src/Common/FileCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 5a0145f0018..47b7d57ae66 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -63,7 +63,7 @@ static bool isQueryInitialized() { return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() - && CurrentThread::getQueryId().size() != 0; + && !CurrentThread::getQueryId().empty(); } bool FileCache::isReadOnly() From 1aa7bbcbbd8b8c0bbd4f4dca6bc35ebde7837945 Mon Sep 17 00:00:00 2001 From: KinderRiven Date: Wed, 10 Aug 2022 23:19:26 +0800 Subject: [PATCH 139/164] fix unique_ptr --- src/Common/IFileCachePriority.h | 2 +- src/Common/LRUFileCachePriority.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/IFileCachePriority.h b/src/Common/IFileCachePriority.h index 59ce3c0aebb..f80266f9eea 100644 --- a/src/Common/IFileCachePriority.h +++ b/src/Common/IFileCachePriority.h @@ -18,7 +18,7 @@ class IFileCachePriority public: class IIterator; using Key = FileCacheKey; - using ReadIterator = std::shared_ptr; + using ReadIterator = std::unique_ptr; using WriteIterator = std::shared_ptr; struct FileCacheRecord diff --git a/src/Common/LRUFileCachePriority.cpp b/src/Common/LRUFileCachePriority.cpp index c54b65f6ee0..b4c4bfa338b 100644 --- a/src/Common/LRUFileCachePriority.cpp +++ b/src/Common/LRUFileCachePriority.cpp @@ -45,7 +45,7 @@ void LRUFileCachePriority::removeAll(std::lock_guard &) IFileCachePriority::ReadIterator LRUFileCachePriority::getLowestPriorityReadIterator(std::lock_guard &) { - return std::make_shared(this, queue.begin()); + return std::make_unique(this, queue.begin()); } IFileCachePriority::WriteIterator LRUFileCachePriority::getLowestPriorityWriteIterator(std::lock_guard &) From bb46bfa6d9e28cf4f48c9796e86f09a22314c07a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 10 Aug 2022 21:20:52 +0300 Subject: [PATCH 140/164] Update process_functional_tests_result.py --- docker/test/util/process_functional_tests_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/util/process_functional_tests_result.py b/docker/test/util/process_functional_tests_result.py index 647989e8421..28f3e211157 100755 --- a/docker/test/util/process_functional_tests_result.py +++ b/docker/test/util/process_functional_tests_result.py @@ -86,7 +86,7 @@ def process_test_log(log_path): test_end = True test_results = [ - (test[0], test[1], test[2], "".join(test[3]))[:4096] for test in test_results + (test[0], test[1], test[2], "".join(test[3])[:4096]) for test in test_results ] return ( From 9284b9b42f62fb84795652a1b58f656734102966 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 10 Aug 2022 20:25:51 +0200 Subject: [PATCH 141/164] tests: fix 00926_adaptive_index_granularity_pk/00489_pk_subexpression flakiness It is possible for toStartOfMinute() to give different result for 0 and 59, for partial timezones (timezone that does not starts from 00:00, like Africa/Monrovia). Before #36656 it fails for another reason, because of overflows [1], but now it fails because it simply return different minutes. [1]: https://github.com/ClickHouse/ClickHouse/pull/29953#discussion_r800550280 Simply pin the UTC there. 
Fixes: #37786 Signed-off-by: Azat Khuzhin --- .../0_stateless/00489_pk_subexpression.sql | 15 ++++++++++++++- .../00926_adaptive_index_granularity_pk.sql | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00489_pk_subexpression.sql b/tests/queries/0_stateless/00489_pk_subexpression.sql index 6f76a13609c..700581a9def 100644 --- a/tests/queries/0_stateless/00489_pk_subexpression.sql +++ b/tests/queries/0_stateless/00489_pk_subexpression.sql @@ -1,7 +1,20 @@ DROP TABLE IF EXISTS pk; set allow_deprecated_syntax_for_merge_tree=1; -CREATE TABLE pk (d Date DEFAULT '2000-01-01', x DateTime, y UInt64, z UInt64) ENGINE = MergeTree(d, (toStartOfMinute(x), y, z), 1); +-- NOTE: here the timezone is pinned to UTC, to avoid issues with "partial +-- timezones" (timezones that does not starts from 00:00), like +-- Africa/Monrovia, for which toStartOfMinute(0) and toStartOfMinute(59) can +-- give different values: +-- +-- SELECT +-- toDateTime(0, 'Africa/Monrovia') AS sec0, +-- toDateTime(59, 'Africa/Monrovia') AS sec59 +-- +-- ┌────────────────sec0─┬───────────────sec59─┐ +-- │ 1969-12-31 23:15:30 │ 1969-12-31 23:16:29 │ +-- └─────────────────────┴─────────────────────┘ +-- +CREATE TABLE pk (d Date DEFAULT '2000-01-01', x DateTime, y UInt64, z UInt64) ENGINE = MergeTree(d, (toStartOfMinute(x, 'UTC'), y, z), 1); INSERT INTO pk (x, y, z) VALUES (1, 11, 1235), (2, 11, 4395), (3, 22, 3545), (4, 22, 6984), (5, 33, 4596), (61, 11, 4563), (62, 11, 4578), (63, 11, 3572), (64, 22, 5786), (65, 22, 5786), (66, 22, 2791), (67, 22, 2791), (121, 33, 2791), (122, 33, 2791), (123, 33, 1235), (124, 44, 4935), (125, 44, 4578), (126, 55, 5786), (127, 55, 2791), (128, 55, 1235); diff --git a/tests/queries/0_stateless/00926_adaptive_index_granularity_pk.sql b/tests/queries/0_stateless/00926_adaptive_index_granularity_pk.sql index ba34cfad299..47e1d3fea0f 100644 --- a/tests/queries/0_stateless/00926_adaptive_index_granularity_pk.sql +++ b/tests/queries/0_stateless/00926_adaptive_index_granularity_pk.sql @@ -4,7 +4,20 @@ SET send_logs_level = 'fatal'; SELECT '----00489----'; DROP TABLE IF EXISTS pk; -CREATE TABLE pk (d Date DEFAULT '2000-01-01', x DateTime, y UInt64, z UInt64) ENGINE = MergeTree() PARTITION BY d ORDER BY (toStartOfMinute(x), y, z) SETTINGS index_granularity_bytes=19, min_index_granularity_bytes=9, write_final_mark = 0; -- one row granule +-- NOTE: here the timezone is pinned to UTC, to avoid issues with "partial +-- timezones" (timezones that does not starts from 00:00), like +-- Africa/Monrovia, for which toStartOfMinute(0) and toStartOfMinute(59) can +-- give different values: +-- +-- SELECT +-- toDateTime(0, 'Africa/Monrovia') AS sec0, +-- toDateTime(59, 'Africa/Monrovia') AS sec59 +-- +-- ┌────────────────sec0─┬───────────────sec59─┐ +-- │ 1969-12-31 23:15:30 │ 1969-12-31 23:16:29 │ +-- └─────────────────────┴─────────────────────┘ +-- +CREATE TABLE pk (d Date DEFAULT '2000-01-01', x DateTime, y UInt64, z UInt64) ENGINE = MergeTree() PARTITION BY d ORDER BY (toStartOfMinute(x, 'UTC'), y, z) SETTINGS index_granularity_bytes=19, min_index_granularity_bytes=9, write_final_mark = 0; -- one row granule INSERT INTO pk (x, y, z) VALUES (1, 11, 1235), (2, 11, 4395), (3, 22, 3545), (4, 22, 6984), (5, 33, 4596), (61, 11, 4563), (62, 11, 4578), (63, 11, 3572), (64, 22, 5786), (65, 22, 5786), (66, 22, 2791), (67, 22, 2791), (121, 33, 2791), (122, 33, 2791), (123, 33, 1235), (124, 44, 4935), (125, 44, 4578), (126, 55, 5786), (127, 55, 2791), (128, 55, 1235); From 
9f85d85e089e176ef0b784802048d090c1aa3577 Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Wed, 10 Aug 2022 13:08:40 -0600 Subject: [PATCH 142/164] The admonitions were missing section endings (#40073) --- docs/ru/development/build-osx.md | 2 ++ docs/ru/engines/table-engines/integrations/postgresql.md | 1 + .../mergetree-family/custom-partitioning-key.md | 2 ++ docs/ru/faq/general/dbms-naming.md | 1 + docs/ru/faq/general/index.md | 1 + docs/ru/faq/general/why-clickhouse-is-so-fast.md | 1 + docs/ru/faq/integration/index.md | 1 + docs/ru/faq/operations/index.md | 1 + docs/ru/getting-started/example-datasets/nyc-taxi.md | 1 + docs/ru/getting-started/example-datasets/ontime.md | 1 + docs/ru/operations/external-authenticators/kerberos.md | 1 + .../operations/server-configuration-parameters/settings.md | 4 ++++ docs/ru/operations/system-tables/mutations.md | 1 + .../sql-reference/aggregate-functions/reference/deltasum.md | 1 + .../aggregate-functions/reference/intervalLengthSum.md | 1 + docs/ru/sql-reference/data-types/nullable.md | 1 + .../external-dictionaries/external-dicts-dict-sources.md | 5 +++++ docs/ru/sql-reference/functions/geo/geohash.md | 2 ++ docs/ru/sql-reference/functions/type-conversion-functions.md | 1 + docs/ru/sql-reference/statements/alter/index.md | 3 ++- docs/ru/sql-reference/statements/create/user.md | 1 + docs/ru/sql-reference/statements/create/view.md | 1 + docs/ru/sql-reference/statements/optimize.md | 1 + docs/ru/sql-reference/statements/show.md | 1 + docs/ru/sql-reference/statements/truncate.md | 1 + docs/ru/sql-reference/statements/watch.md | 1 + docs/ru/sql-reference/table-functions/postgresql.md | 2 ++ 27 files changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/ru/development/build-osx.md b/docs/ru/development/build-osx.md index cc6927bebb8..205edce2b78 100644 --- a/docs/ru/development/build-osx.md +++ b/docs/ru/development/build-osx.md @@ -8,6 +8,7 @@ sidebar_label: Сборка на Mac OS X :::info "Вам не нужно собирать ClickHouse самостоятельно" Вы можете установить предварительно собранный ClickHouse, как описано в [Быстром старте](https://clickhouse.com/#quick-start). Следуйте инструкциям по установке для `macOS (Intel)` или `macOS (Apple Silicon)`. +::: Сборка должна запускаться с x86_64 (Intel) на macOS версии 10.15 (Catalina) и выше в последней версии компилятора Xcode's native AppleClang, Homebrew's vanilla Clang или в GCC-компиляторах. @@ -90,6 +91,7 @@ $ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/ :::info "Note" Вам понадобится команда `sudo`. +::: 1. Создайте файл `/Library/LaunchDaemons/limit.maxfiles.plist` и поместите в него следующее: diff --git a/docs/ru/engines/table-engines/integrations/postgresql.md b/docs/ru/engines/table-engines/integrations/postgresql.md index f9702930a30..28debaf9c23 100644 --- a/docs/ru/engines/table-engines/integrations/postgresql.md +++ b/docs/ru/engines/table-engines/integrations/postgresql.md @@ -49,6 +49,7 @@ PostgreSQL массивы конвертируются в массивы ClickHo :::info "Внимание" Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустимы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. +::: Поддерживает несколько реплик, которые должны быть перечислены через `|`. 
Например: diff --git a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md index 6b5d2e862bf..e30e771c4df 100644 --- a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -40,6 +40,7 @@ ORDER BY (CounterID, StartDate, intHash32(UserID)); :::info "Info" Не рекомендуется делать слишком гранулированное партиционирование – то есть задавать партиции по столбцу, в котором будет слишком большой разброс значений (речь идет о порядке более тысячи партиций). Это приведет к скоплению большого числа файлов и файловых дескрипторов в системе, что может значительно снизить производительность запросов `SELECT`. +::: Чтобы получить набор кусков и партиций таблицы, можно воспользоваться системной таблицей [system.parts](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#system_tables-parts). В качестве примера рассмотрим таблицу `visits`, в которой задано партиционирование по месяцам. Выполним `SELECT` для таблицы `system.parts`: @@ -80,6 +81,7 @@ WHERE table = 'visits' :::info "Info" Названия кусков для таблиц старого типа образуются следующим образом: `20190117_20190123_2_2_0` (минимальная дата _ максимальная дата _ номер минимального блока _ номер максимального блока _ уровень). +::: Как видно из примера выше, таблица содержит несколько отдельных кусков для одной и той же партиции (например, куски `201901_1_3_1` и `201901_1_9_2` принадлежат партиции `201901`). Это означает, что эти куски еще не были объединены – в файловой системе они хранятся отдельно. После того как будет выполнено автоматическое слияние данных (выполняется примерно спустя 10 минут после вставки данных), исходные куски будут объединены в один более крупный кусок и помечены как неактивные. diff --git a/docs/ru/faq/general/dbms-naming.md b/docs/ru/faq/general/dbms-naming.md index 9bc036cc2e4..dd58d89924e 100644 --- a/docs/ru/faq/general/dbms-naming.md +++ b/docs/ru/faq/general/dbms-naming.md @@ -14,3 +14,4 @@ sidebar_position: 10 :::info "Забавный факт" Спустя годы после того, как ClickHouse получил свое название, принцип комбинирования двух слов, каждое из которых имеет подходящий смысл, был признан лучшим способом назвать базу данных в [исследовании Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), Associate Professor of Databases в Carnegie Mellon University. ClickHouse разделил награду "за лучшее название СУБД" с Postgres. +::: \ No newline at end of file diff --git a/docs/ru/faq/general/index.md b/docs/ru/faq/general/index.md index 81715a64acd..7d34cc643f2 100644 --- a/docs/ru/faq/general/index.md +++ b/docs/ru/faq/general/index.md @@ -20,5 +20,6 @@ sidebar_label: Общие вопросы :::info "Если вы не нашли то, что искали:" Загляните в другие категории F.A.Q. или поищите в остальных разделах документации, ориентируясь по оглавлению слева. +::: [Original article](https://clickhouse.com/docs/ru/faq/general/) diff --git a/docs/ru/faq/general/why-clickhouse-is-so-fast.md b/docs/ru/faq/general/why-clickhouse-is-so-fast.md index 0c74cb2cfa2..43b3c818249 100644 --- a/docs/ru/faq/general/why-clickhouse-is-so-fast.md +++ b/docs/ru/faq/general/why-clickhouse-is-so-fast.md @@ -60,3 +60,4 @@ sidebar_position: 8 - Ориентируйтесь на показатели, собранные при работе с реальными данными. - Проверяйте производительность в процессе CI. 
- Измеряйте и анализируйте всё, что только возможно. +::: diff --git a/docs/ru/faq/integration/index.md b/docs/ru/faq/integration/index.md index cc917718000..ee01688af6e 100644 --- a/docs/ru/faq/integration/index.md +++ b/docs/ru/faq/integration/index.md @@ -15,5 +15,6 @@ sidebar_label: Интеграция :::info "Если вы не нашли то, что искали" Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируйтесь по оглавлению слева. +::: [Original article](https://clickhouse.com/docs/ru/faq/integration/) diff --git a/docs/ru/faq/operations/index.md b/docs/ru/faq/operations/index.md index bd5ba13e182..aab31ec3305 100644 --- a/docs/ru/faq/operations/index.md +++ b/docs/ru/faq/operations/index.md @@ -14,5 +14,6 @@ sidebar_label: Операции :::info "Если вы не нашли то, что искали" Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируйтесь по оглавлению слева. +::: [Original article](https://clickhouse.com/docs/en/faq/operations/) diff --git a/docs/ru/getting-started/example-datasets/nyc-taxi.md b/docs/ru/getting-started/example-datasets/nyc-taxi.md index 991d7dafe05..9d9caa43b5e 100644 --- a/docs/ru/getting-started/example-datasets/nyc-taxi.md +++ b/docs/ru/getting-started/example-datasets/nyc-taxi.md @@ -293,6 +293,7 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.trips_mergetree" :::info "Info" Если вы собираетесь выполнять запросы, приведенные ниже, то к имени таблицы нужно добавить имя базы, `datasets.trips_mergetree`. +::: ## Результаты на одном сервере {#rezultaty-na-odnom-servere} diff --git a/docs/ru/getting-started/example-datasets/ontime.md b/docs/ru/getting-started/example-datasets/ontime.md index b7c4a2f952c..8dab19c7bae 100644 --- a/docs/ru/getting-started/example-datasets/ontime.md +++ b/docs/ru/getting-started/example-datasets/ontime.md @@ -157,6 +157,7 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.ontime" :::info "Info" Если вы собираетесь выполнять запросы, приведенные ниже, то к имени таблицы нужно добавить имя базы, `datasets.ontime`. +::: ## Запросы: {#zaprosy} diff --git a/docs/ru/operations/external-authenticators/kerberos.md b/docs/ru/operations/external-authenticators/kerberos.md index 9575d935836..197bf5a6047 100644 --- a/docs/ru/operations/external-authenticators/kerberos.md +++ b/docs/ru/operations/external-authenticators/kerberos.md @@ -99,6 +99,7 @@ ClickHouse предоставляет возможность аутентифи :::info "" Ещё раз отметим, что кроме `users.xml`, необходимо также включить Kerberos в `config.xml`. +::: ### Настройка Kerberos через SQL {#enabling-kerberos-using-sql} diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 222c6bccfaf..f1d7280892e 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -174,6 +174,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part :::info "Примечание" Жесткое ограничение настраивается с помощью системных инструментов. +::: **Пример** @@ -706,6 +707,7 @@ ClickHouse поддерживает динамическое изменение :::info "Примечание" Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. 
+::: Возможные значения: @@ -726,6 +728,7 @@ ClickHouse поддерживает динамическое изменение :::info "Примечание" Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. +::: Возможные значения: @@ -746,6 +749,7 @@ ClickHouse поддерживает динамическое изменение :::info "Примечание" Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. +::: Возможные значения: diff --git a/docs/ru/operations/system-tables/mutations.md b/docs/ru/operations/system-tables/mutations.md index f2047745c0c..f3810e29698 100644 --- a/docs/ru/operations/system-tables/mutations.md +++ b/docs/ru/operations/system-tables/mutations.md @@ -30,6 +30,7 @@ :::info "Замечание" Даже если `parts_to_do = 0`, для реплицированной таблицы возможна ситуация, когда мутация ещё не завершена из-за долго выполняющейся операции `INSERT`, которая добавляет данные, которые нужно будет мутировать. +::: Если во время мутации какого-либо куска возникли проблемы, заполняются следующие столбцы: diff --git a/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md b/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md index f46d7ca6111..49edc3932e0 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/deltasum.md @@ -8,6 +8,7 @@ sidebar_position: 141 :::info "Примечание" Чтобы эта функция работала должным образом, исходные данные должны быть отсортированы. В [материализованном представлении](../../../sql-reference/statements/create/view.md#materialized) вместо нее рекомендуется использовать [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp). +::: **Синтаксис** diff --git a/docs/ru/sql-reference/aggregate-functions/reference/intervalLengthSum.md b/docs/ru/sql-reference/aggregate-functions/reference/intervalLengthSum.md index 46dac961639..45b41bd13a3 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/intervalLengthSum.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/intervalLengthSum.md @@ -20,6 +20,7 @@ intervalLengthSum(start, end) :::info "Примечание" Аргументы должны быть одного типа. В противном случае ClickHouse сгенерирует исключение. +::: **Возвращаемое значение** diff --git a/docs/ru/sql-reference/data-types/nullable.md b/docs/ru/sql-reference/data-types/nullable.md index f018409772d..31a3674af6b 100644 --- a/docs/ru/sql-reference/data-types/nullable.md +++ b/docs/ru/sql-reference/data-types/nullable.md @@ -26,6 +26,7 @@ sidebar_label: Nullable :::info "Info" Почти всегда использование `Nullable` снижает производительность, учитывайте это при проектировании своих баз. +::: ## Поиск NULL {#finding-null} diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index ac03dd39047..a80fedfbb24 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -464,6 +464,7 @@ SOURCE(ODBC( :::info "Примечание" Поля `table` и `query` не могут быть использованы вместе. Также обязательно должен быть один из источников данных: `table` или `query`. 
+::: ClickHouse получает от ODBC-драйвера информацию о квотировании и квотирует настройки в запросах к драйверу, поэтому имя таблицы нужно указывать в соответствии с регистром имени таблицы в базе данных. @@ -543,6 +544,7 @@ SOURCE(MYSQL( :::info "Примечание" Поля `table` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `table` или `query`. Явный параметр `secure` отсутствует. Автоматически поддержана работа в обоих случаях: когда установка SSL-соединения необходима и когда нет. +::: MySQL можно подключить на локальном хосте через сокеты, для этого необходимо задать `host` и `socket`. @@ -633,6 +635,7 @@ SOURCE(CLICKHOUSE( :::info "Примечание" Поля `table` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `table` или `query`. +::: ### MongoDB {#dicts-external_dicts_dict_sources-mongodb} @@ -748,6 +751,7 @@ SOURCE(REDIS( :::info "Примечание" Поля `column_family` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `column_family` или `query`. +::: ### PostgreSQL {#dicts-external_dicts_dict_sources-postgresql} @@ -804,3 +808,4 @@ SOURCE(POSTGRESQL( :::info "Примечание" Поля `table` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `table` или `query`. +::: diff --git a/docs/ru/sql-reference/functions/geo/geohash.md b/docs/ru/sql-reference/functions/geo/geohash.md index fe853cef267..933775dcfbe 100644 --- a/docs/ru/sql-reference/functions/geo/geohash.md +++ b/docs/ru/sql-reference/functions/geo/geohash.md @@ -86,6 +86,7 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi :::info "Замечание" Все передаваемые координаты должны быть одного и того же типа: либо `Float32`, либо `Float64`. +::: **Возвращаемые значения** @@ -96,6 +97,7 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi :::info "Замечание" Если возвращаемый массив содержит свыше 10 000 000 элементов, функция сгенерирует исключение. +::: **Пример** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4e3bae9ddb7..a7279b548e2 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1209,6 +1209,7 @@ SELECT toLowCardinality('1'); :::info "Примечание" Возвращаемое значение — это временная метка в UTC, а не в часовом поясе `DateTime64`. +::: **Синтаксис** diff --git a/docs/ru/sql-reference/statements/alter/index.md b/docs/ru/sql-reference/statements/alter/index.md index fd3f6c8adf6..0191c794e9c 100644 --- a/docs/ru/sql-reference/statements/alter/index.md +++ b/docs/ru/sql-reference/statements/alter/index.md @@ -25,7 +25,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN - [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md) - [TTL](../../../sql-reference/statements/alter/ttl.md) - :::note + :::note Запрос `ALTER TABLE` поддерживается только для таблиц типа `*MergeTree`, а также `Merge` и `Distributed`. Запрос имеет несколько вариантов. 
::: Следующие запросы `ALTER` управляют представлениями: @@ -77,5 +77,6 @@ ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name :::info "Примечание" Для всех запросов `ALTER` при `replication_alter_partitions_sync = 2` и неактивности некоторых реплик больше времени, заданного настройкой `replication_wait_for_inactive_replica_timeout`, генерируется исключение `UNFINISHED`. +::: Для запросов `ALTER TABLE ... UPDATE|DELETE` синхронность выполнения определяется настройкой [mutations_sync](../../../operations/settings/settings.md#mutations_sync). diff --git a/docs/ru/sql-reference/statements/create/user.md b/docs/ru/sql-reference/statements/create/user.md index d7da1748821..683e56a61b3 100644 --- a/docs/ru/sql-reference/statements/create/user.md +++ b/docs/ru/sql-reference/statements/create/user.md @@ -56,6 +56,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] :::info "Внимание" ClickHouse трактует конструкцию `user_name@'address'` как имя пользователя целиком. То есть технически вы можете создать несколько пользователей с одинаковыми `user_name`, но разными частями конструкции после `@`, но лучше так не делать. +::: ## Секция GRANTEES {#grantees} diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 9a2db0ac2de..a0193cea21c 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -88,6 +88,7 @@ LIVE-представления работают по тому же принци - `LIVE VIEW` не обновляется, если в исходном запросе используются несколько таблиц. В случаях, когда `LIVE VIEW` не обновляется автоматически, чтобы обновлять его принудительно с заданной периодичностью, используйте [WITH REFRESH](#live-view-with-refresh). +::: ### Отслеживание изменений LIVE-представлений {#live-view-monitoring} diff --git a/docs/ru/sql-reference/statements/optimize.md b/docs/ru/sql-reference/statements/optimize.md index b0b71ae412c..61480a0c1ab 100644 --- a/docs/ru/sql-reference/statements/optimize.md +++ b/docs/ru/sql-reference/statements/optimize.md @@ -30,6 +30,7 @@ ClickHouse не оповещает клиента. Чтобы включить :::info "Примечание" Если значение настройки `replication_alter_partitions_sync` равно `2` и некоторые реплики не активны больше времени, заданного настройкой `replication_wait_for_inactive_replica_timeout`, то генерируется исключение `UNFINISHED`. +::: ## Выражение BY {#by-expression} diff --git a/docs/ru/sql-reference/statements/show.md b/docs/ru/sql-reference/statements/show.md index 0926cc0863e..3e7560b0882 100644 --- a/docs/ru/sql-reference/statements/show.md +++ b/docs/ru/sql-reference/statements/show.md @@ -368,6 +368,7 @@ SHOW ACCESS :::info "Note" По запросу `SHOW CLUSTER name` вы получите содержимое таблицы system.clusters для этого кластера. +::: ### Синтаксис {#show-cluster-syntax} diff --git a/docs/ru/sql-reference/statements/truncate.md b/docs/ru/sql-reference/statements/truncate.md index 63cf6271d72..cac6c261f21 100644 --- a/docs/ru/sql-reference/statements/truncate.md +++ b/docs/ru/sql-reference/statements/truncate.md @@ -19,3 +19,4 @@ TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] :::info "Примечание" Если значение настройки `replication_alter_partitions_sync` равно `2` и некоторые реплики не активны больше времени, заданного настройкой `replication_wait_for_inactive_replica_timeout`, то генерируется исключение `UNFINISHED`. 
+::: diff --git a/docs/ru/sql-reference/statements/watch.md b/docs/ru/sql-reference/statements/watch.md index 227a6e26c02..f925e25b0d5 100644 --- a/docs/ru/sql-reference/statements/watch.md +++ b/docs/ru/sql-reference/statements/watch.md @@ -104,3 +104,4 @@ WATCH lv EVENTS LIMIT 1; :::info "Примечание" При отслеживании [LIVE VIEW](./create/view.md#live-view) через интерфейс HTTP следует использовать формат [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress). Постоянные сообщения об изменениях будут добавлены в поток вывода для поддержания активности долговременного HTTP-соединения до тех пор, пока результат запроса изменяется. Проомежуток времени между сообщениями об изменениях управляется настройкой[live_view_heartbeat_interval](./create/view.md#live-view-settings). +::: \ No newline at end of file diff --git a/docs/ru/sql-reference/table-functions/postgresql.md b/docs/ru/sql-reference/table-functions/postgresql.md index 3408951c690..2e1e5314f91 100644 --- a/docs/ru/sql-reference/table-functions/postgresql.md +++ b/docs/ru/sql-reference/table-functions/postgresql.md @@ -28,6 +28,7 @@ postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) :::info "Примечание" В запросах `INSERT` для того чтобы отличить табличную функцию `postgresql(...)` от таблицы со списком имен столбцов вы должны указывать ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. +::: ## Особенности реализации {#implementation-details} @@ -43,6 +44,7 @@ PostgreSQL массивы конвертируются в массивы ClickHo :::info "Примечание" Будьте внимательны, в PostgreSQL массивы, созданные как `type_name[]`, являются многомерными и могут содержать в себе разное количество измерений в разных строках одной таблицы. Внутри ClickHouse допустипы только многомерные массивы с одинаковым кол-вом измерений во всех строках таблицы. +::: Поддерживает несколько реплик, которые должны быть перечислены через `|`. 
Например: From 6a30c23252a41ada572bad7e54e13535d295a0df Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 10 Aug 2022 21:48:00 +0200 Subject: [PATCH 143/164] tests/performance: cover sparse_hashed dictionary (#40027) Signed-off-by: Azat Khuzhin Signed-off-by: Azat Khuzhin --- tests/performance/hashed_dictionary.xml | 193 ++++++++++++------------ 1 file changed, 100 insertions(+), 93 deletions(-) diff --git a/tests/performance/hashed_dictionary.xml b/tests/performance/hashed_dictionary.xml index cf1cdac6df1..01ee35c8ed4 100644 --- a/tests/performance/hashed_dictionary.xml +++ b/tests/performance/hashed_dictionary.xml @@ -1,72 +1,4 @@ - - CREATE TABLE simple_key_hashed_dictionary_source_table - ( - id UInt64, - value_int UInt64, - value_string String, - value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) ENGINE = Memory; - - - - CREATE TABLE complex_key_hashed_dictionary_source_table - ( - id UInt64, - id_key String, - value_int UInt64, - value_string String, - value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) ENGINE = Memory; - - - - CREATE DICTIONARY simple_key_hashed_dictionary - ( - id UInt64, - value_int UInt64, - value_string String, - value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) - PRIMARY KEY id - SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_hashed_dictionary_source_table')) - LAYOUT(HASHED()) - LIFETIME(MIN 0 MAX 1000); - - - - CREATE DICTIONARY complex_key_hashed_dictionary - ( - id UInt64, - id_key String, - value_int UInt64, - value_string String, - value_decimal Decimal64(8), - value_string_nullable Nullable(String) - ) - PRIMARY KEY id, id_key - SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_hashed_dictionary_source_table')) - LAYOUT(COMPLEX_KEY_HASHED()) - LIFETIME(MIN 0 MAX 1000); - - - - INSERT INTO simple_key_hashed_dictionary_source_table - SELECT number, number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 5000000; - - - - INSERT INTO complex_key_hashed_dictionary_source_table - SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) - FROM system.numbers - LIMIT 5000000; - - column_name @@ -85,54 +17,129 @@ 7500000 + + + layout_suffix + + HASHED + SPARSE_HASHED + + + + CREATE TABLE simple_key_dictionary_source_table + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory + + + + CREATE TABLE complex_key_dictionary_source_table + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) ENGINE = Memory + + + + CREATE DICTIONARY IF NOT EXISTS simple_key_{layout_suffix}_dictionary + ( + id UInt64, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id + SOURCE(CLICKHOUSE(TABLE 'simple_key_dictionary_source_table')) + LAYOUT({layout_suffix}()) + LIFETIME(0) + + + + CREATE DICTIONARY IF NOT EXISTS complex_key_{layout_suffix}_dictionary + ( + id UInt64, + id_key String, + value_int UInt64, + value_string String, + value_decimal Decimal64(8), + value_string_nullable Nullable(String) + ) + PRIMARY KEY id, id_key + SOURCE(CLICKHOUSE(TABLE 'complex_key_dictionary_source_table')) + LAYOUT(COMPLEX_KEY_{layout_suffix}()) + LIFETIME(0) + + + + INSERT INTO simple_key_dictionary_source_table + SELECT number, number, toString(number), toDecimal64(number, 8), 
toString(number) + FROM system.numbers + LIMIT 5000000 + + + + INSERT INTO complex_key_dictionary_source_table + SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number) + FROM system.numbers + LIMIT 5000000 + + + SYSTEM RELOAD DICTIONARY simple_key_{layout_suffix}_dictionary + SYSTEM RELOAD DICTIONARY complex_key_{layout_suffix}_dictionary + + SYSTEM RELOAD DICTIONARY simple_key_{layout_suffix}_dictionary + SYSTEM RELOAD DICTIONARY complex_key_{layout_suffix}_dictionary + WITH rand64() % toUInt64({elements_count}) as key - SELECT dictGet('default.simple_key_hashed_dictionary', {column_name}, key) + SELECT dictGet('default.simple_key_{layout_suffix}_dictionary', {column_name}, key) FROM system.numbers LIMIT {elements_count} - FORMAT Null; + FORMAT Null WITH rand64() % toUInt64({elements_count}) as key - SELECT dictHas('default.simple_key_hashed_dictionary', key) + SELECT dictHas('default.simple_key_{layout_suffix}_dictionary', key) FROM system.numbers LIMIT {elements_count} - FORMAT Null; + FORMAT Null + SELECT * FROM simple_key_{layout_suffix}_dictionary FORMAT Null + - SELECT * FROM simple_key_hashed_dictionary - FORMAT Null; + WITH (rand64() % toUInt64({elements_count}), toString(rand64() % toUInt64({elements_count}))) as key + SELECT dictGet('default.complex_key_{layout_suffix}_dictionary', {column_name}, key) + FROM system.numbers + LIMIT {elements_count} + FORMAT Null WITH (rand64() % toUInt64({elements_count}), toString(rand64() % toUInt64({elements_count}))) as key - SELECT dictGet('default.complex_key_hashed_dictionary', {column_name}, key) + SELECT dictHas('default.complex_key_{layout_suffix}_dictionary', key) FROM system.numbers LIMIT {elements_count} - FORMAT Null; + FORMAT Null - - WITH (rand64() % toUInt64({elements_count}), toString(rand64() % toUInt64({elements_count}))) as key - SELECT dictHas('default.complex_key_hashed_dictionary', key) - FROM system.numbers - LIMIT {elements_count} - FORMAT Null; - + SELECT * FROM complex_key_{layout_suffix}_dictionary FORMAT Null - - SELECT * FROM complex_key_hashed_dictionary - FORMAT Null; - - - DROP TABLE IF EXISTS simple_key_hashed_dictionary_source_table; - DROP TABLE IF EXISTS complex_key_hashed_dictionary_source_table; - - DROP DICTIONARY IF EXISTS simple_key_hashed_dictionary; - DROP DICTIONARY IF EXISTS complex_key_hashed_dictionary; + DROP TABLE IF EXISTS simple_key_dictionary_source_table + DROP TABLE IF EXISTS complex_key_dictionary_source_table + DROP DICTIONARY IF EXISTS simple_key_{layout_suffix}_dictionary + DROP DICTIONARY IF EXISTS complex_key_{layout_suffix}_dictionary From ce9c0c2da3459d63c5a95c7495f47e7467624012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 10 Aug 2022 21:53:11 +0200 Subject: [PATCH 144/164] Style --- src/Interpreters/UserDefinedSQLFunctionFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/UserDefinedSQLFunctionFactory.cpp b/src/Interpreters/UserDefinedSQLFunctionFactory.cpp index db11ee12b03..2f876f00cc3 100644 --- a/src/Interpreters/UserDefinedSQLFunctionFactory.cpp +++ b/src/Interpreters/UserDefinedSQLFunctionFactory.cpp @@ -163,6 +163,6 @@ std::vector UserDefinedSQLFunctionFactory::getAllRegisteredNames() bool UserDefinedSQLFunctionFactory::empty() const { std::lock_guard lock(mutex); - return function_name_to_create_query.size() == 0; + return function_name_to_create_query.empty(); } } From d7a545e30d27079c81c3ad9c0c37b04de9f56e94 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 10 Aug 2022 22:05:09 +0200 Subject: [PATCH 145/164] Try to optimize CurrentMemoryTracker alloc and free --- src/Common/CurrentThread.cpp | 7 ------- src/Common/CurrentThread.h | 7 ++++++- src/Common/ThreadStatus.cpp | 10 ++++++++-- src/Common/ThreadStatus.h | 10 ++++++++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/Common/CurrentThread.cpp b/src/Common/CurrentThread.cpp index e3f6b63b28a..a2cdf9c8361 100644 --- a/src/Common/CurrentThread.cpp +++ b/src/Common/CurrentThread.cpp @@ -43,13 +43,6 @@ ProfileEvents::Counters & CurrentThread::getProfileEvents() return current_thread ? current_thread->performance_counters : ProfileEvents::global_counters; } -MemoryTracker * CurrentThread::getMemoryTracker() -{ - if (unlikely(!current_thread)) - return nullptr; - return &current_thread->memory_tracker; -} - void CurrentThread::updateProgressIn(const Progress & value) { if (unlikely(!current_thread)) diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index fa52fafa9e2..cbe60365798 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -54,7 +54,12 @@ public: static void updatePerformanceCounters(); static ProfileEvents::Counters & getProfileEvents(); - static MemoryTracker * getMemoryTracker(); + inline ALWAYS_INLINE static MemoryTracker * getMemoryTracker() + { + if (unlikely(!current_thread)) + return nullptr; + return &current_thread->memory_tracker; + } /// Update read and write rows (bytes) statistics (used in system.query_thread_log) static void updateProgressIn(const Progress & value); diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 4dd32f7ff10..98f78cada5c 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -24,9 +24,15 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - -thread_local ThreadStatus * current_thread = nullptr; +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++20-compat" +#endif +thread_local ThreadStatus constinit * current_thread = nullptr; thread_local ThreadStatus * main_thread = nullptr; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif #if !defined(SANITIZER) namespace diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 7c22d3b8335..594e86ffa2e 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -102,8 +102,14 @@ public: using ThreadGroupStatusPtr = std::shared_ptr<ThreadGroupStatus>; - -extern thread_local ThreadStatus * current_thread; +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc++20-compat" +#endif +extern thread_local constinit ThreadStatus * current_thread; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif /** Encapsulates all per-thread info (ProfileEvents, MemoryTracker, query_id, query context, etc.). * The object must be created in thread function and destroyed in the same thread before the exit. From 5934c6519f210eba32a83abd377664e6ce3e3bd6 Mon Sep 17 00:00:00 2001 From: "Mikhail f.
Shiryaev" Date: Thu, 11 Aug 2022 00:06:01 +0200 Subject: [PATCH 146/164] Regenerate changelogs and update versions --- docs/changelogs/v22.3.1.1262-prestable.md | 2 +- docs/changelogs/v22.3.2.2-lts.md | 2 +- docs/changelogs/v22.3.3.44-lts.md | 2 +- docs/changelogs/v22.3.4.20-lts.md | 2 +- docs/changelogs/v22.3.5.5-lts.md | 2 +- docs/changelogs/v22.3.6.5-lts.md | 2 +- docs/changelogs/v22.3.7.28-lts.md | 2 +- docs/changelogs/v22.4.1.2305-prestable.md | 10 +++++----- docs/changelogs/v22.4.2.1-stable.md | 2 +- docs/changelogs/v22.4.3.3-stable.md | 2 +- docs/changelogs/v22.4.4.7-stable.md | 2 +- docs/changelogs/v22.4.5.9-stable.md | 2 +- docs/changelogs/v22.5.1.2079-stable.md | 8 ++++---- docs/changelogs/v22.6.1.1985-stable.md | 8 ++++---- docs/changelogs/v22.6.2.12-stable.md | 2 +- docs/changelogs/v22.6.3.35-stable.md | 2 +- docs/changelogs/v22.6.4.35-stable.md | 2 +- utils/list-versions/version_date.tsv | 3 +++ 18 files changed, 30 insertions(+), 27 deletions(-) diff --git a/docs/changelogs/v22.3.1.1262-prestable.md b/docs/changelogs/v22.3.1.1262-prestable.md index 0058396d634..6c16a92ed26 100644 --- a/docs/changelogs/v22.3.1.1262-prestable.md +++ b/docs/changelogs/v22.3.1.1262-prestable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.1.1262-prestable FIXME as compared to v22.2.1.2139-prestable +### ClickHouse release v22.3.1.1262-prestable (92ab33f560e) FIXME as compared to v22.2.1.2139-prestable (75366fc95e5) #### Backward Incompatible Change * Improvement the toDatetime function overflows. When the date string is very large, it will be converted to 1970. [#32898](https://github.com/ClickHouse/ClickHouse/pull/32898) ([HaiBo Li](https://github.com/marising)). diff --git a/docs/changelogs/v22.3.2.2-lts.md b/docs/changelogs/v22.3.2.2-lts.md index b755db300c8..b873926e068 100644 --- a/docs/changelogs/v22.3.2.2-lts.md +++ b/docs/changelogs/v22.3.2.2-lts.md @@ -5,5 +5,5 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.2.2-lts FIXME as compared to v22.3.1.1262-prestable +### ClickHouse release v22.3.2.2-lts (89a621679c6) FIXME as compared to v22.3.1.1262-prestable (92ab33f560e) diff --git a/docs/changelogs/v22.3.3.44-lts.md b/docs/changelogs/v22.3.3.44-lts.md index 4cd48eefa5a..d0da3397d1d 100644 --- a/docs/changelogs/v22.3.3.44-lts.md +++ b/docs/changelogs/v22.3.3.44-lts.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.3.44-lts FIXME as compared to v22.3.2.2-lts +### ClickHouse release v22.3.3.44-lts (abb756d3ca2) FIXME as compared to v22.3.2.2-lts (89a621679c6) #### Bug Fix * Backported in [#35928](https://github.com/ClickHouse/ClickHouse/issues/35928): Added settings `input_format_ipv4_default_on_conversion_error`, `input_format_ipv6_default_on_conversion_error` to allow insert of invalid ip address values as default into tables. Closes [#35726](https://github.com/ClickHouse/ClickHouse/issues/35726). [#35733](https://github.com/ClickHouse/ClickHouse/pull/35733) ([Maksim Kita](https://github.com/kitaisreal)). 
diff --git a/docs/changelogs/v22.3.4.20-lts.md b/docs/changelogs/v22.3.4.20-lts.md index d820adbdbec..34a8cd8c25a 100644 --- a/docs/changelogs/v22.3.4.20-lts.md +++ b/docs/changelogs/v22.3.4.20-lts.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.4.20-lts FIXME as compared to v22.3.3.44-lts +### ClickHouse release v22.3.4.20-lts (ecbaf001f49) FIXME as compared to v22.3.3.44-lts (abb756d3ca2) #### Build/Testing/Packaging Improvement * - Add `_le_` method for ClickHouseVersion - Fix auto_version for existing tag - docker_server now support getting version from tags - Add python unit tests to backport workflow. [#36028](https://github.com/ClickHouse/ClickHouse/pull/36028) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). diff --git a/docs/changelogs/v22.3.5.5-lts.md b/docs/changelogs/v22.3.5.5-lts.md index 7deff1be416..fc1332b68c6 100644 --- a/docs/changelogs/v22.3.5.5-lts.md +++ b/docs/changelogs/v22.3.5.5-lts.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.5.5-lts FIXME as compared to v22.3.4.20-lts +### ClickHouse release v22.3.5.5-lts (438b4a81f77) FIXME as compared to v22.3.4.20-lts (ecbaf001f49) #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.3.6.5-lts.md b/docs/changelogs/v22.3.6.5-lts.md index 4b4772c611a..535fa0e7ad3 100644 --- a/docs/changelogs/v22.3.6.5-lts.md +++ b/docs/changelogs/v22.3.6.5-lts.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.6.5-lts FIXME as compared to v22.3.5.5-lts +### ClickHouse release v22.3.6.5-lts (3e44e824cff) FIXME as compared to v22.3.5.5-lts (438b4a81f77) #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.3.7.28-lts.md b/docs/changelogs/v22.3.7.28-lts.md index 14cb8628f09..5a6900e7fa4 100644 --- a/docs/changelogs/v22.3.7.28-lts.md +++ b/docs/changelogs/v22.3.7.28-lts.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.3.7.28-lts FIXME as compared to v22.3.6.5-lts +### ClickHouse release v22.3.7.28-lts (420bdfa2751) FIXME as compared to v22.3.6.5-lts (3e44e824cff) #### Bug Fix (user-visible misbehavior in official stable or prestable release) diff --git a/docs/changelogs/v22.4.1.2305-prestable.md b/docs/changelogs/v22.4.1.2305-prestable.md index c202b0b9331..da1ba4a42bd 100644 --- a/docs/changelogs/v22.4.1.2305-prestable.md +++ b/docs/changelogs/v22.4.1.2305-prestable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.4.1.2305-prestable FIXME as compared to v22.3.1.1262-prestable +### ClickHouse release v22.4.1.2305-prestable (77a82cc090d) FIXME as compared to v22.3.1.1262-prestable (92ab33f560e) #### Backward Incompatible Change * Function `yandexConsistentHash` (consistent hashing algorithm by Konstantin "kostik" Oblakov) is renamed to `kostikConsistentHash`. The old name is left as an alias for compatibility. Although this change is backward compatible, we may remove the alias in subsequent releases, that's why it's recommended to update the usages of this function in your apps. [#35553](https://github.com/ClickHouse/ClickHouse/pull/35553) ([Alexey Milovidov](https://github.com/alexey-milovidov)). @@ -68,7 +68,7 @@ sidebar_label: 2022 * For lts releases packages will be pushed to both lts and stable repos. [#35382](https://github.com/ClickHouse/ClickHouse/pull/35382) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). 
* Support uuid for postgres engines. Closes [#35384](https://github.com/ClickHouse/ClickHouse/issues/35384). [#35403](https://github.com/ClickHouse/ClickHouse/pull/35403) ([Kseniia Sumarokova](https://github.com/kssenii)). * Add arguments `--user`, `--password`, `--host`, `--port` for clickhouse-diagnostics. [#35422](https://github.com/ClickHouse/ClickHouse/pull/35422) ([李扬](https://github.com/taiyang-li)). -* fix INSERT INTO table FROM INFILE does not display progress bar. [#35429](https://github.com/ClickHouse/ClickHouse/pull/35429) ([xiedeyantu](https://github.com/xiedeyantu)). +* fix INSERT INTO table FROM INFILE does not display progress bar. [#35429](https://github.com/ClickHouse/ClickHouse/pull/35429) ([chen](https://github.com/xiedeyantu)). * Allow server to bind to low-numbered ports (e.g. 443). ClickHouse installation script will set `cap_net_bind_service` to the binary file. [#35451](https://github.com/ClickHouse/ClickHouse/pull/35451) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Add settings `input_format_orc_case_insensitive_column_matching`, `input_format_arrow_case_insensitive_column_matching`, and `input_format_parquet_case_insensitive_column_matching` which allows ClickHouse to use case insensitive matching of columns while reading data from ORC, Arrow or Parquet files. [#35459](https://github.com/ClickHouse/ClickHouse/pull/35459) ([Antonio Andelic](https://github.com/antonio2368)). * - Add explicit table info to the scan node of query plan and pipeline. [#35460](https://github.com/ClickHouse/ClickHouse/pull/35460) ([何李夫](https://github.com/helifu)). @@ -106,7 +106,7 @@ sidebar_label: 2022 * ASTPartition::formatImpl should output ALL while executing ALTER TABLE t DETACH PARTITION ALL. [#35987](https://github.com/ClickHouse/ClickHouse/pull/35987) ([awakeljw](https://github.com/awakeljw)). * `clickhouse-keeper` starts answering 4-letter commands before getting the quorum. [#35992](https://github.com/ClickHouse/ClickHouse/pull/35992) ([Antonio Andelic](https://github.com/antonio2368)). * Fix wrong assertion in replxx which happens when navigating back the history when the first line of input is a newline. Mark as improvement because it only affects debug build. This fixes [#34511](https://github.com/ClickHouse/ClickHouse/issues/34511). [#36007](https://github.com/ClickHouse/ClickHouse/pull/36007) ([Amos Bird](https://github.com/amosbird)). -* If someone writes DEFAULT NULL in table definition, make data type Nullable. [#35887](https://github.com/ClickHouse/ClickHouse/issues/35887). [#36058](https://github.com/ClickHouse/ClickHouse/pull/36058) ([xiedeyantu](https://github.com/xiedeyantu)). +* If someone writes DEFAULT NULL in table definition, make data type Nullable. [#35887](https://github.com/ClickHouse/ClickHouse/issues/35887). [#36058](https://github.com/ClickHouse/ClickHouse/pull/36058) ([chen](https://github.com/xiedeyantu)). * Added `thread_id` and `query_id` columns to `system.zookeeper_log` table. [#36074](https://github.com/ClickHouse/ClickHouse/pull/36074) ([Alexander Tokmakov](https://github.com/tavplubix)). * Auto assign numbers for Enum elements. [#36101](https://github.com/ClickHouse/ClickHouse/pull/36101) ([awakeljw](https://github.com/awakeljw)). * Reset thread name in `ThreadPool` to `ThreadPoolIdle` after job is done. This is to avoid displaying the old thread name for idle threads. This closes [#36114](https://github.com/ClickHouse/ClickHouse/issues/36114). 
[#36115](https://github.com/ClickHouse/ClickHouse/pull/36115) ([Alexey Milovidov](https://github.com/alexey-milovidov)). @@ -331,7 +331,7 @@ sidebar_label: 2022 * ci: replace directory system log tables artifacts with tsv [#35773](https://github.com/ClickHouse/ClickHouse/pull/35773) ([Azat Khuzhin](https://github.com/azat)). * One more try to resurrect build hash [#35774](https://github.com/ClickHouse/ClickHouse/pull/35774) ([alesapin](https://github.com/alesapin)). * Refactoring QueryPipeline [#35789](https://github.com/ClickHouse/ClickHouse/pull/35789) ([Amos Bird](https://github.com/amosbird)). -* Delete duplicate code [#35798](https://github.com/ClickHouse/ClickHouse/pull/35798) ([xiedeyantu](https://github.com/xiedeyantu)). +* Delete duplicate code [#35798](https://github.com/ClickHouse/ClickHouse/pull/35798) ([chen](https://github.com/xiedeyantu)). * remove unused variable [#35800](https://github.com/ClickHouse/ClickHouse/pull/35800) ([flynn](https://github.com/ucasfl)). * Make `SortDescription::column_name` always non-empty [#35805](https://github.com/ClickHouse/ClickHouse/pull/35805) ([Nikita Taranov](https://github.com/nickitat)). * Fix latest_error referenced before assignment [#35807](https://github.com/ClickHouse/ClickHouse/pull/35807) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). @@ -417,7 +417,7 @@ sidebar_label: 2022 * Revert reverting "Fix crash in ParallelReadBuffer" [#36212](https://github.com/ClickHouse/ClickHouse/pull/36212) ([Kruglov Pavel](https://github.com/Avogar)). * Make stateless tests with s3 always green [#36214](https://github.com/ClickHouse/ClickHouse/pull/36214) ([Alexander Tokmakov](https://github.com/tavplubix)). * Add Tyler Hannan to contributors [#36216](https://github.com/ClickHouse/ClickHouse/pull/36216) ([Tyler Hannan](https://github.com/tylerhannan)). -* Fix the repeated call of func to get the table when drop table [#36248](https://github.com/ClickHouse/ClickHouse/pull/36248) ([xiedeyantu](https://github.com/xiedeyantu)). +* Fix the repeated call of func to get the table when drop table [#36248](https://github.com/ClickHouse/ClickHouse/pull/36248) ([chen](https://github.com/xiedeyantu)). * Split test 01675_data_type_coroutine into 2 tests to prevent possible timeouts [#36250](https://github.com/ClickHouse/ClickHouse/pull/36250) ([Kruglov Pavel](https://github.com/Avogar)). * Merge TRUSTED_CONTRIBUTORS in lambda and import in check [#36252](https://github.com/ClickHouse/ClickHouse/pull/36252) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Fix exception "File segment can be completed only by downloader" in tests [#36253](https://github.com/ClickHouse/ClickHouse/pull/36253) ([Kseniia Sumarokova](https://github.com/kssenii)). 
diff --git a/docs/changelogs/v22.4.2.1-stable.md b/docs/changelogs/v22.4.2.1-stable.md index fb77d3fee9b..2bd245e00e7 100644 --- a/docs/changelogs/v22.4.2.1-stable.md +++ b/docs/changelogs/v22.4.2.1-stable.md @@ -5,5 +5,5 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.4.2.1-stable FIXME as compared to v22.4.1.2305-prestable +### ClickHouse release v22.4.2.1-stable (b34ebdc36ae) FIXME as compared to v22.4.1.2305-prestable (77a82cc090d) diff --git a/docs/changelogs/v22.4.3.3-stable.md b/docs/changelogs/v22.4.3.3-stable.md index 4baa63672ab..3b0f1e11cd5 100644 --- a/docs/changelogs/v22.4.3.3-stable.md +++ b/docs/changelogs/v22.4.3.3-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.4.3.3-stable FIXME as compared to v22.4.2.1-stable +### ClickHouse release v22.4.3.3-stable (def956d6299) FIXME as compared to v22.4.2.1-stable (b34ebdc36ae) #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.4.4.7-stable.md b/docs/changelogs/v22.4.4.7-stable.md index 71e077ac071..a0bc92db3e8 100644 --- a/docs/changelogs/v22.4.4.7-stable.md +++ b/docs/changelogs/v22.4.4.7-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.4.4.7-stable FIXME as compared to v22.4.3.3-stable +### ClickHouse release v22.4.4.7-stable (ba44414f9b3) FIXME as compared to v22.4.3.3-stable (def956d6299) #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.4.5.9-stable.md b/docs/changelogs/v22.4.5.9-stable.md index 636ad2ed3ac..a80dfd01a2b 100644 --- a/docs/changelogs/v22.4.5.9-stable.md +++ b/docs/changelogs/v22.4.5.9-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.4.5.9-stable FIXME as compared to v22.4.4.7-stable +### ClickHouse release v22.4.5.9-stable (059ef6cadcd) FIXME as compared to v22.4.4.7-stable (ba44414f9b3) #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.5.1.2079-stable.md b/docs/changelogs/v22.5.1.2079-stable.md index dfdcad64561..f6ca6fcc478 100644 --- a/docs/changelogs/v22.5.1.2079-stable.md +++ b/docs/changelogs/v22.5.1.2079-stable.md @@ -5,10 +5,10 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.5.1.2079-stable FIXME as compared to v22.4.1.2305-prestable +### ClickHouse release v22.5.1.2079-stable (df0cb062098) FIXME as compared to v22.4.1.2305-prestable (77a82cc090d) #### Backward Incompatible Change -* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Meena-Renganathan](https://github.com/Meena-Renganathan)). +* Updated the BoringSSL module to the official FIPS compliant version. This makes ClickHouse FIPS compliant. [#35914](https://github.com/ClickHouse/ClickHouse/pull/35914) ([Deleted user](https://github.com/ghost)). * Now, background merges, mutations and `OPTIMIZE` will not increment `SelectedRows` and `SelectedBytes` metrics. They (still) will increment `MergedRows` and `MergedUncompressedBytes` as it was before. [#37040](https://github.com/ClickHouse/ClickHouse/pull/37040) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). #### New Feature @@ -20,7 +20,7 @@ sidebar_label: 2022 * Parse collations in CREATE TABLE, throw exception or ignore. closes [#35892](https://github.com/ClickHouse/ClickHouse/issues/35892). 
[#36271](https://github.com/ClickHouse/ClickHouse/pull/36271) ([yuuch](https://github.com/yuuch)). * Add aliases JSONLines and NDJSON for JSONEachRow. Closes [#36303](https://github.com/ClickHouse/ClickHouse/issues/36303). [#36327](https://github.com/ClickHouse/ClickHouse/pull/36327) ([flynn](https://github.com/ucasfl)). * Set parts_to_delay_insert and parts_to_throw_insert as query-level settings. If they are defined, they can override table-level settings. [#36371](https://github.com/ClickHouse/ClickHouse/pull/36371) ([Memo](https://github.com/Joeywzr)). -* temporary table can show total rows and total bytes. [#36401](https://github.com/ClickHouse/ClickHouse/issues/36401). [#36439](https://github.com/ClickHouse/ClickHouse/pull/36439) ([xiedeyantu](https://github.com/xiedeyantu)). +* temporary table can show total rows and total bytes. [#36401](https://github.com/ClickHouse/ClickHouse/issues/36401). [#36439](https://github.com/ClickHouse/ClickHouse/pull/36439) ([chen](https://github.com/xiedeyantu)). * Added new hash function - wyHash64. [#36467](https://github.com/ClickHouse/ClickHouse/pull/36467) ([olevino](https://github.com/olevino)). * Window function nth_value was added. [#36601](https://github.com/ClickHouse/ClickHouse/pull/36601) ([Nikolay](https://github.com/ndchikin)). * Add MySQLDump input format. It reads all data from INSERT queries belonging to one table in dump. If there are more than one table, by default it reads data from the first one. [#36667](https://github.com/ClickHouse/ClickHouse/pull/36667) ([Kruglov Pavel](https://github.com/Avogar)). @@ -212,7 +212,7 @@ sidebar_label: 2022 * Update version after release [#36502](https://github.com/ClickHouse/ClickHouse/pull/36502) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Followup on [#36172](https://github.com/ClickHouse/ClickHouse/issues/36172) password hash salt feature [#36510](https://github.com/ClickHouse/ClickHouse/pull/36510) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). * Update version_date.tsv after v22.4.2.1-stable [#36533](https://github.com/ClickHouse/ClickHouse/pull/36533) ([github-actions[bot]](https://github.com/apps/github-actions)). -* fix log should print 'from' path [#36535](https://github.com/ClickHouse/ClickHouse/pull/36535) ([xiedeyantu](https://github.com/xiedeyantu)). +* fix log should print 'from' path [#36535](https://github.com/ClickHouse/ClickHouse/pull/36535) ([chen](https://github.com/xiedeyantu)). * Add function bin tests for Int/UInt128/UInt256 [#36537](https://github.com/ClickHouse/ClickHouse/pull/36537) ([Memo](https://github.com/Joeywzr)). * Fix 01161_all_system_tables [#36539](https://github.com/ClickHouse/ClickHouse/pull/36539) ([Antonio Andelic](https://github.com/antonio2368)). * Update PULL_REQUEST_TEMPLATE.md [#36543](https://github.com/ClickHouse/ClickHouse/pull/36543) ([Ivan Blinkov](https://github.com/blinkov)). diff --git a/docs/changelogs/v22.6.1.1985-stable.md b/docs/changelogs/v22.6.1.1985-stable.md index eeb4078eb04..9d930b8a0bb 100644 --- a/docs/changelogs/v22.6.1.1985-stable.md +++ b/docs/changelogs/v22.6.1.1985-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.6.1.1985-stable FIXME as compared to v22.5.1.2079-stable +### ClickHouse release v22.6.1.1985-stable (7000c4e0033) FIXME as compared to v22.5.1.2079-stable (df0cb062098) #### Backward Incompatible Change * Changes how settings using `seconds` as type are parsed to support floating point values (for example: `max_execution_time=0.5`). 
Infinity or NaN values will throw an exception. [#37187](https://github.com/ClickHouse/ClickHouse/pull/37187) ([Raúl Marín](https://github.com/Algunenano)). @@ -78,7 +78,7 @@ sidebar_label: 2022 * Allow to use String type instead of Binary in Arrow/Parquet/ORC formats. This PR introduces 3 new settings for it: `output_format_arrow_string_as_string`, `output_format_parquet_string_as_string`, `output_format_orc_string_as_string`. Default value for all settings is `false`. [#37327](https://github.com/ClickHouse/ClickHouse/pull/37327) ([Kruglov Pavel](https://github.com/Avogar)). * Apply setting `input_format_max_rows_to_read_for_schema_inference` for all read rows in total from all files in globs. Previously setting `input_format_max_rows_to_read_for_schema_inference` was applied for each file in glob separately and in case of huge number of nulls we could read first `input_format_max_rows_to_read_for_schema_inference` rows from each file and get nothing. Also increase default value for this setting to 25000. [#37332](https://github.com/ClickHouse/ClickHouse/pull/37332) ([Kruglov Pavel](https://github.com/Avogar)). * allows providing `NULL`/`NOT NULL` right after type in column declaration. [#37337](https://github.com/ClickHouse/ClickHouse/pull/37337) ([Igor Nikonov](https://github.com/devcrafter)). -* optimize file segment PARTIALLY_DOWNLOADED get read buffer. [#37338](https://github.com/ClickHouse/ClickHouse/pull/37338) ([xiedeyantu](https://github.com/xiedeyantu)). +* optimize file segment PARTIALLY_DOWNLOADED get read buffer. [#37338](https://github.com/ClickHouse/ClickHouse/pull/37338) ([chen](https://github.com/xiedeyantu)). * Allow to prune the list of files via virtual columns such as `_file` and `_path` when reading from S3. This is for [#37174](https://github.com/ClickHouse/ClickHouse/issues/37174) , [#23494](https://github.com/ClickHouse/ClickHouse/issues/23494). [#37356](https://github.com/ClickHouse/ClickHouse/pull/37356) ([Amos Bird](https://github.com/amosbird)). * Try to improve short circuit functions processing to fix problems with stress tests. [#37384](https://github.com/ClickHouse/ClickHouse/pull/37384) ([Kruglov Pavel](https://github.com/Avogar)). * Closes [#37395](https://github.com/ClickHouse/ClickHouse/issues/37395). [#37415](https://github.com/ClickHouse/ClickHouse/pull/37415) ([Memo](https://github.com/Joeywzr)). @@ -117,7 +117,7 @@ sidebar_label: 2022 * Remove recursive submodules, because we don't need them and they can be confusing. Add style check to prevent recursive submodules. This closes [#32821](https://github.com/ClickHouse/ClickHouse/issues/32821). [#37616](https://github.com/ClickHouse/ClickHouse/pull/37616) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Add docs spellcheck to CI. [#37790](https://github.com/ClickHouse/ClickHouse/pull/37790) ([Vladimir C](https://github.com/vdimir)). * Fix overly aggressive stripping which removed the embedded hash required for checking the consistency of the executable. [#37993](https://github.com/ClickHouse/ClickHouse/pull/37993) ([Robert Schulze](https://github.com/rschu1ze)). -* fix MacOS build compressor faild. [#38007](https://github.com/ClickHouse/ClickHouse/pull/38007) ([xiedeyantu](https://github.com/xiedeyantu)). +* fix MacOS build compressor faild. [#38007](https://github.com/ClickHouse/ClickHouse/pull/38007) ([chen](https://github.com/xiedeyantu)). 
#### Bug Fix (user-visible misbehavior in official stable or prestable release) @@ -166,7 +166,7 @@ sidebar_label: 2022 * Fix possible incorrect result of `SELECT ... WITH FILL` in the case when `ORDER BY` should be applied after `WITH FILL` result (e.g. for outer query). Incorrect result was caused by optimization for `ORDER BY` expressions ([#35623](https://github.com/ClickHouse/ClickHouse/issues/35623)). Closes [#37904](https://github.com/ClickHouse/ClickHouse/issues/37904). [#37959](https://github.com/ClickHouse/ClickHouse/pull/37959) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). * Add missing default columns when pushing to the target table in WindowView, fix [#37815](https://github.com/ClickHouse/ClickHouse/issues/37815). [#37965](https://github.com/ClickHouse/ClickHouse/pull/37965) ([vxider](https://github.com/Vxider)). * Fixed a stack overflow issue that would cause compilation to fail. [#37996](https://github.com/ClickHouse/ClickHouse/pull/37996) ([Han Shukai](https://github.com/KinderRiven)). -* when open enable_filesystem_query_cache_limit, throw Reserved cache size exceeds the remaining cache size. [#38004](https://github.com/ClickHouse/ClickHouse/pull/38004) ([xiedeyantu](https://github.com/xiedeyantu)). +* when open enable_filesystem_query_cache_limit, throw Reserved cache size exceeds the remaining cache size. [#38004](https://github.com/ClickHouse/ClickHouse/pull/38004) ([chen](https://github.com/xiedeyantu)). * Query, containing ORDER BY ... WITH FILL, can generate extra rows when multiple WITH FILL columns are present. [#38074](https://github.com/ClickHouse/ClickHouse/pull/38074) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v22.6.2.12-stable.md b/docs/changelogs/v22.6.2.12-stable.md index 224367b994a..a97492ffe27 100644 --- a/docs/changelogs/v22.6.2.12-stable.md +++ b/docs/changelogs/v22.6.2.12-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.6.2.12-stable FIXME as compared to v22.6.1.1985-stable +### ClickHouse release v22.6.2.12-stable (1fc97f10cbf) FIXME as compared to v22.6.1.1985-stable (7000c4e0033) #### Improvement * Backported in [#38484](https://github.com/ClickHouse/ClickHouse/issues/38484): Improve the stability for hive storage integration test. Move the data prepare step into test.py. [#38260](https://github.com/ClickHouse/ClickHouse/pull/38260) ([lgbo](https://github.com/lgbo-ustc)). diff --git a/docs/changelogs/v22.6.3.35-stable.md b/docs/changelogs/v22.6.3.35-stable.md index 584aeafc48e..e3eee326915 100644 --- a/docs/changelogs/v22.6.3.35-stable.md +++ b/docs/changelogs/v22.6.3.35-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.6.3.35-stable FIXME as compared to v22.6.2.12-stable +### ClickHouse release v22.6.3.35-stable (d5566f2f2dd) FIXME as compared to v22.6.2.12-stable (1fc97f10cbf) #### Bug Fix * Backported in [#38812](https://github.com/ClickHouse/ClickHouse/issues/38812): Fix crash when executing GRANT ALL ON *.* with ON CLUSTER. It was broken in https://github.com/ClickHouse/ClickHouse/pull/35767. This closes [#38618](https://github.com/ClickHouse/ClickHouse/issues/38618). [#38674](https://github.com/ClickHouse/ClickHouse/pull/38674) ([Vitaly Baranov](https://github.com/vitlibar)). 
diff --git a/docs/changelogs/v22.6.4.35-stable.md b/docs/changelogs/v22.6.4.35-stable.md index d70d20d6134..b6c63d94eab 100644 --- a/docs/changelogs/v22.6.4.35-stable.md +++ b/docs/changelogs/v22.6.4.35-stable.md @@ -5,7 +5,7 @@ sidebar_label: 2022 # 2022 Changelog -### ClickHouse release v22.6.4.35-stable FIXME as compared to v22.6.3.35-stable +### ClickHouse release v22.6.4.35-stable (b9202cae6f4) FIXME as compared to v22.6.3.35-stable (d5566f2f2dd) #### Build/Testing/Packaging Improvement * Backported in [#38822](https://github.com/ClickHouse/ClickHouse/issues/38822): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 3cb248efa55..0ab5349b97b 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v22.7.3.5-stable 2022-08-10 v22.7.2.15-stable 2022-08-03 v22.7.1.2484-stable 2022-07-21 v22.6.5.22-stable 2022-08-09 @@ -5,6 +6,7 @@ v22.6.4.35-stable 2022-07-25 v22.6.3.35-stable 2022-07-06 v22.6.2.12-stable 2022-06-29 v22.6.1.1985-stable 2022-06-16 +v22.5.4.19-stable 2022-08-10 v22.5.3.21-stable 2022-07-25 v22.5.2.53-stable 2022-07-07 v22.5.1.2079-stable 2022-05-19 @@ -13,6 +15,7 @@ v22.4.5.9-stable 2022-05-06 v22.4.4.7-stable 2022-04-29 v22.4.3.3-stable 2022-04-26 v22.4.2.1-stable 2022-04-22 +v22.3.11.12-lts 2022-08-10 v22.3.10.22-lts 2022-08-03 v22.3.9.19-lts 2022-07-25 v22.3.8.39-lts 2022-07-07 From f47873769e620d896442ccb9220acdc2c32b1525 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 11 Aug 2022 00:18:08 +0200 Subject: [PATCH 147/164] Add omitted changelogs --- docs/changelogs/v22.3.10.22-lts.md | 30 +++++++++++++++++++ docs/changelogs/v22.3.11.12-lts.md | 20 +++++++++++++ docs/changelogs/v22.3.8.39-lts.md | 32 ++++++++++++++++++++ docs/changelogs/v22.3.9.19-lts.md | 24 +++++++++++++++ docs/changelogs/v22.4.6.53-stable.md | 44 ++++++++++++++++++++++++++++ docs/changelogs/v22.5.2.53-stable.md | 40 +++++++++++++++++++++++++ docs/changelogs/v22.5.3.21-stable.md | 24 +++++++++++++++ docs/changelogs/v22.5.4.19-stable.md | 29 ++++++++++++++++++ docs/changelogs/v22.7.3.5-stable.md | 18 ++++++++++++ 9 files changed, 261 insertions(+) create mode 100644 docs/changelogs/v22.3.10.22-lts.md create mode 100644 docs/changelogs/v22.3.11.12-lts.md create mode 100644 docs/changelogs/v22.3.8.39-lts.md create mode 100644 docs/changelogs/v22.3.9.19-lts.md create mode 100644 docs/changelogs/v22.4.6.53-stable.md create mode 100644 docs/changelogs/v22.5.2.53-stable.md create mode 100644 docs/changelogs/v22.5.3.21-stable.md create mode 100644 docs/changelogs/v22.5.4.19-stable.md create mode 100644 docs/changelogs/v22.7.3.5-stable.md diff --git a/docs/changelogs/v22.3.10.22-lts.md b/docs/changelogs/v22.3.10.22-lts.md new file mode 100644 index 00000000000..48009cb4f67 --- /dev/null +++ b/docs/changelogs/v22.3.10.22-lts.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.3.10.22-lts (25886f517d4) FIXME as compared to v22.3.9.19-lts (7976930b82e) + +#### Bug Fix +* Backported in [#39761](https://github.com/ClickHouse/ClickHouse/issues/39761): Fix seeking while reading from encrypted disk. 
This PR fixes [#38381](https://github.com/ClickHouse/ClickHouse/issues/38381). [#39687](https://github.com/ClickHouse/ClickHouse/pull/39687) ([Vitaly Baranov](https://github.com/vitlibar)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#39206](https://github.com/ClickHouse/ClickHouse/issues/39206): Fix reading of sparse columns from `MergeTree` tables that store their data in S3. [#37978](https://github.com/ClickHouse/ClickHouse/pull/37978) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#39381](https://github.com/ClickHouse/ClickHouse/issues/39381): Fixed error `Not found column Type in block` in selects with `PREWHERE` and read-in-order optimizations. [#39157](https://github.com/ClickHouse/ClickHouse/pull/39157) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#39588](https://github.com/ClickHouse/ClickHouse/issues/39588): Fix data race and possible heap-buffer-overflow in Avro format. Closes [#39094](https://github.com/ClickHouse/ClickHouse/issues/39094) Closes [#33652](https://github.com/ClickHouse/ClickHouse/issues/33652). [#39498](https://github.com/ClickHouse/ClickHouse/pull/39498) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#39610](https://github.com/ClickHouse/ClickHouse/issues/39610): Fix bug with maxsplit argument for splitByChar, which was not working correctly. [#39552](https://github.com/ClickHouse/ClickHouse/pull/39552) ([filimonov](https://github.com/filimonov)). +* Backported in [#39834](https://github.com/ClickHouse/ClickHouse/issues/39834): Fix `CANNOT_READ_ALL_DATA` exception with `local_filesystem_read_method=pread_threadpool`. This bug affected only Linux kernel version 5.9 and 5.10 according to [man](https://manpages.debian.org/testing/manpages-dev/preadv2.2.en.html#BUGS). [#39800](https://github.com/ClickHouse/ClickHouse/pull/39800) ([Anton Popov](https://github.com/CurtizJ)). + +#### Bug Fix (user-visible misbehaviour in official stable or prestable release) + +* Backported in [#39238](https://github.com/ClickHouse/ClickHouse/issues/39238): Fix performance regression of scalar query optimization. [#35986](https://github.com/ClickHouse/ClickHouse/pull/35986) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#39531](https://github.com/ClickHouse/ClickHouse/issues/39531): Fix some issues with async reads from remote filesystem which happened when reading low cardinality. [#36763](https://github.com/ClickHouse/ClickHouse/pull/36763) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Replace MemoryTrackerBlockerInThread to LockMemoryExceptionInThread [#39619](https://github.com/ClickHouse/ClickHouse/pull/39619) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Change mysql-odbc url [#39702](https://github.com/ClickHouse/ClickHouse/pull/39702) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.3.11.12-lts.md b/docs/changelogs/v22.3.11.12-lts.md new file mode 100644 index 00000000000..b10de69e234 --- /dev/null +++ b/docs/changelogs/v22.3.11.12-lts.md @@ -0,0 +1,20 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.3.11.12-lts (137c5f72657) FIXME as compared to v22.3.10.22-lts (25886f517d4) + +#### Build/Testing/Packaging Improvement +* Backported in [#39881](https://github.com/ClickHouse/ClickHouse/issues/39881): Former packages used to install systemd.service file to `/etc`. 
The files there are marked as `conf` and are not cleaned out, and not updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#39336](https://github.com/ClickHouse/ClickHouse/issues/39336): Fix `parallel_view_processing=1` with `optimize_trivial_insert_select=1`. Fix `max_insert_threads` while pushing to views. [#38731](https://github.com/ClickHouse/ClickHouse/pull/38731) ([Azat Khuzhin](https://github.com/azat)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Backport [#39687](https://github.com/ClickHouse/ClickHouse/issues/39687) to 22.3: Fix seeking while reading from encrypted disk"'. [#40052](https://github.com/ClickHouse/ClickHouse/pull/40052) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v22.3.8.39-lts.md b/docs/changelogs/v22.3.8.39-lts.md new file mode 100644 index 00000000000..893e8762e9c --- /dev/null +++ b/docs/changelogs/v22.3.8.39-lts.md @@ -0,0 +1,32 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.3.8.39-lts (6bcf982f58b) FIXME as compared to v22.3.7.28-lts (420bdfa2751) + +#### Build/Testing/Packaging Improvement +* Backported in [#38826](https://github.com/ClickHouse/ClickHouse/issues/38826): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#38453](https://github.com/ClickHouse/ClickHouse/issues/38453): Fix bug with nested short-circuit functions that led to execution of arguments even if condition is false. Closes [#38040](https://github.com/ClickHouse/ClickHouse/issues/38040). [#38173](https://github.com/ClickHouse/ClickHouse/pull/38173) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#38710](https://github.com/ClickHouse/ClickHouse/issues/38710): Fix incorrect result of distributed queries with `DISTINCT` and `LIMIT`. Fixes [#38282](https://github.com/ClickHouse/ClickHouse/issues/38282). [#38371](https://github.com/ClickHouse/ClickHouse/pull/38371) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#38689](https://github.com/ClickHouse/ClickHouse/issues/38689): Now it's possible to start a clickhouse-server and attach/detach tables even for tables with the incorrect values of IPv4/IPv6 representation. Proper fix for issue [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#38590](https://github.com/ClickHouse/ClickHouse/pull/38590) ([alesapin](https://github.com/alesapin)). +* Backported in [#38776](https://github.com/ClickHouse/ClickHouse/issues/38776): `rankCorr` function will work correctly if some arguments are NaNs. This closes [#38396](https://github.com/ClickHouse/ClickHouse/issues/38396). [#38722](https://github.com/ClickHouse/ClickHouse/pull/38722) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#38780](https://github.com/ClickHouse/ClickHouse/issues/38780): Fix use-after-free for Map combinator that leads to incorrect result. 
[#38748](https://github.com/ClickHouse/ClickHouse/pull/38748) ([Azat Khuzhin](https://github.com/azat)). + +#### Bug Fix (user-visible misbehaviour in official stable or prestable release) + +* Backported in [#36818](https://github.com/ClickHouse/ClickHouse/issues/36818): Fix projection analysis which might lead to wrong query result when IN subquery is used. This fixes [#35336](https://github.com/ClickHouse/ClickHouse/issues/35336). [#35631](https://github.com/ClickHouse/ClickHouse/pull/35631) ([Amos Bird](https://github.com/amosbird)). +* Backported in [#38467](https://github.com/ClickHouse/ClickHouse/issues/38467): - Fix potential error with literals in `WHERE` for join queries. Close [#36279](https://github.com/ClickHouse/ClickHouse/issues/36279). [#36542](https://github.com/ClickHouse/ClickHouse/pull/36542) ([Vladimir C](https://github.com/vdimir)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Try to fix some trash [#37303](https://github.com/ClickHouse/ClickHouse/pull/37303) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Update docker-compose to try get rid of v1 errors [#38394](https://github.com/ClickHouse/ClickHouse/pull/38394) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Trying backport useful features for CI [#38510](https://github.com/ClickHouse/ClickHouse/pull/38510) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix backports diff [#38703](https://github.com/ClickHouse/ClickHouse/pull/38703) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.3.9.19-lts.md b/docs/changelogs/v22.3.9.19-lts.md new file mode 100644 index 00000000000..c00b9bfc8eb --- /dev/null +++ b/docs/changelogs/v22.3.9.19-lts.md @@ -0,0 +1,24 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.3.9.19-lts (7976930b82e) FIXME as compared to v22.3.8.39-lts (6bcf982f58b) + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#39097](https://github.com/ClickHouse/ClickHouse/issues/39097): Any allocations inside OvercommitTracker may lead to deadlock. Logging was not very informative so it's easier just to remove logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794). [#39030](https://github.com/ClickHouse/ClickHouse/pull/39030) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#39080](https://github.com/ClickHouse/ClickHouse/issues/39080): Fix bug in filesystem cache that could happen in some corner case which coincided with cache capacity hitting the limit. Closes [#39066](https://github.com/ClickHouse/ClickHouse/issues/39066). [#39070](https://github.com/ClickHouse/ClickHouse/pull/39070) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#39149](https://github.com/ClickHouse/ClickHouse/issues/39149): Fix error `Block structure mismatch` which could happen for INSERT into table with attached MATERIALIZED VIEW and enabled setting `extremes = 1`. Closes [#29759](https://github.com/ClickHouse/ClickHouse/issues/29759) and [#38729](https://github.com/ClickHouse/ClickHouse/issues/38729). [#39125](https://github.com/ClickHouse/ClickHouse/pull/39125) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#39372](https://github.com/ClickHouse/ClickHouse/issues/39372): Declare RabbitMQ queue without default arguments `x-max-length` and `x-overflow`. [#39259](https://github.com/ClickHouse/ClickHouse/pull/39259) ([rnbondarenko](https://github.com/rnbondarenko)). 
+* Backported in [#39379](https://github.com/ClickHouse/ClickHouse/issues/39379): Fix segmentation fault in MaterializedPostgreSQL database engine, which could happen if some exception occurred at replication initialisation. Closes [#36939](https://github.com/ClickHouse/ClickHouse/issues/36939). [#39272](https://github.com/ClickHouse/ClickHouse/pull/39272) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#39351](https://github.com/ClickHouse/ClickHouse/issues/39351): Fix incorrect fetch postgresql tables query fro PostgreSQL database engine. Closes [#33502](https://github.com/ClickHouse/ClickHouse/issues/33502). [#39283](https://github.com/ClickHouse/ClickHouse/pull/39283) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Reproduce and a little bit better fix for LC dict right offset. [#36856](https://github.com/ClickHouse/ClickHouse/pull/36856) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Retry docker buildx commands with progressive sleep in between [#38898](https://github.com/ClickHouse/ClickHouse/pull/38898) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add docker_server.py running to backport and release CIs [#39011](https://github.com/ClickHouse/ClickHouse/pull/39011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.4.6.53-stable.md b/docs/changelogs/v22.4.6.53-stable.md new file mode 100644 index 00000000000..cdd4c885574 --- /dev/null +++ b/docs/changelogs/v22.4.6.53-stable.md @@ -0,0 +1,44 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.4.6.53-stable (0625731c940) FIXME as compared to v22.4.5.9-stable (059ef6cadcd) + +#### New Feature +* Backported in [#38714](https://github.com/ClickHouse/ClickHouse/issues/38714): SALT is allowed for CREATE USER IDENTIFIED WITH sha256_hash. [#37377](https://github.com/ClickHouse/ClickHouse/pull/37377) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). + +#### Build/Testing/Packaging Improvement +* Backported in [#38828](https://github.com/ClickHouse/ClickHouse/issues/38828): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#37717](https://github.com/ClickHouse/ClickHouse/issues/37717): Fix unexpected errors with a clash of constant strings in aggregate function, prewhere and join. Close [#36891](https://github.com/ClickHouse/ClickHouse/issues/36891). [#37336](https://github.com/ClickHouse/ClickHouse/pull/37336) ([Vladimir C](https://github.com/vdimir)). +* Backported in [#37512](https://github.com/ClickHouse/ClickHouse/issues/37512): Fix logical error in normalizeUTF8 functions. Closes [#37298](https://github.com/ClickHouse/ClickHouse/issues/37298). [#37443](https://github.com/ClickHouse/ClickHouse/pull/37443) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#37941](https://github.com/ClickHouse/ClickHouse/issues/37941): Fix setting cast_ipv4_ipv6_default_on_conversion_error for internal cast function. Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). 
[#37761](https://github.com/ClickHouse/ClickHouse/pull/37761) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#38452](https://github.com/ClickHouse/ClickHouse/issues/38452): Fix bug with nested short-circuit functions that led to execution of arguments even if condition is false. Closes [#38040](https://github.com/ClickHouse/ClickHouse/issues/38040). [#38173](https://github.com/ClickHouse/ClickHouse/pull/38173) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#38711](https://github.com/ClickHouse/ClickHouse/issues/38711): Fix incorrect result of distributed queries with `DISTINCT` and `LIMIT`. Fixes [#38282](https://github.com/ClickHouse/ClickHouse/issues/38282). [#38371](https://github.com/ClickHouse/ClickHouse/pull/38371) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#38593](https://github.com/ClickHouse/ClickHouse/issues/38593): Fix parts removal (will be left forever if they had not been removed on server shutdown) after incorrect server shutdown. [#38486](https://github.com/ClickHouse/ClickHouse/pull/38486) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#38596](https://github.com/ClickHouse/ClickHouse/issues/38596): Fix table creation to avoid replication issues with pre-22.4 replicas. [#38541](https://github.com/ClickHouse/ClickHouse/pull/38541) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#38686](https://github.com/ClickHouse/ClickHouse/issues/38686): Now it's possible to start a clickhouse-server and attach/detach tables even for tables with the incorrect values of IPv4/IPv6 representation. Proper fix for issue [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#38590](https://github.com/ClickHouse/ClickHouse/pull/38590) ([alesapin](https://github.com/alesapin)). +* Backported in [#38663](https://github.com/ClickHouse/ClickHouse/issues/38663): Adapt some more nodes to avoid issues with pre-22.4 replicas. [#38627](https://github.com/ClickHouse/ClickHouse/pull/38627) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#38777](https://github.com/ClickHouse/ClickHouse/issues/38777): `rankCorr` function will work correctly if some arguments are NaNs. This closes [#38396](https://github.com/ClickHouse/ClickHouse/issues/38396). [#38722](https://github.com/ClickHouse/ClickHouse/pull/38722) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#38781](https://github.com/ClickHouse/ClickHouse/issues/38781): Fix use-after-free for Map combinator that leads to incorrect result. [#38748](https://github.com/ClickHouse/ClickHouse/pull/38748) ([Azat Khuzhin](https://github.com/azat)). + +#### Bug Fix (user-visible misbehaviour in official stable or prestable release) + +* Backported in [#37456](https://github.com/ClickHouse/ClickHouse/issues/37456): Server might fail to start if it cannot resolve hostname of external ClickHouse dictionary. It's fixed. Fixes [#36451](https://github.com/ClickHouse/ClickHouse/issues/36451). [#36463](https://github.com/ClickHouse/ClickHouse/pull/36463) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#38468](https://github.com/ClickHouse/ClickHouse/issues/38468): - Fix potential error with literals in `WHERE` for join queries. Close [#36279](https://github.com/ClickHouse/ClickHouse/issues/36279). [#36542](https://github.com/ClickHouse/ClickHouse/pull/36542) ([Vladimir C](https://github.com/vdimir)). 
+* Backported in [#37363](https://github.com/ClickHouse/ClickHouse/issues/37363): Fixed problem with infs in `quantileTDigest`. Fixes [#32107](https://github.com/ClickHouse/ClickHouse/issues/32107). [#37021](https://github.com/ClickHouse/ClickHouse/pull/37021) ([Vladimir Chebotarev](https://github.com/excitoon)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Integration tests [#36866](https://github.com/ClickHouse/ClickHouse/pull/36866) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Try to fix some trash [#37303](https://github.com/ClickHouse/ClickHouse/pull/37303) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Update protobuf files for kafka and rabbitmq [fix integration tests] [#37884](https://github.com/ClickHouse/ClickHouse/pull/37884) ([Nikita Taranov](https://github.com/nickitat)). +* Try fix `test_grpc_protocol/test.py::test_progress` [#37908](https://github.com/ClickHouse/ClickHouse/pull/37908) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Update docker-compose to try get rid of v1 errors [#38394](https://github.com/ClickHouse/ClickHouse/pull/38394) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix backports diff [#38703](https://github.com/ClickHouse/ClickHouse/pull/38703) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.5.2.53-stable.md b/docs/changelogs/v22.5.2.53-stable.md new file mode 100644 index 00000000000..10b64632680 --- /dev/null +++ b/docs/changelogs/v22.5.2.53-stable.md @@ -0,0 +1,40 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.5.2.53-stable (5fd600fda9e) FIXME as compared to v22.5.1.2079-stable (df0cb062098) + +#### New Feature +* Backported in [#38713](https://github.com/ClickHouse/ClickHouse/issues/38713): SALT is allowed for CREATE USER IDENTIFIED WITH sha256_hash. [#37377](https://github.com/ClickHouse/ClickHouse/pull/37377) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). + +#### Build/Testing/Packaging Improvement +* Backported in [#38827](https://github.com/ClickHouse/ClickHouse/issues/38827): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#37716](https://github.com/ClickHouse/ClickHouse/issues/37716): Fix unexpected errors with a clash of constant strings in aggregate function, prewhere and join. Close [#36891](https://github.com/ClickHouse/ClickHouse/issues/36891). [#37336](https://github.com/ClickHouse/ClickHouse/pull/37336) ([Vladimir C](https://github.com/vdimir)). +* Backported in [#37408](https://github.com/ClickHouse/ClickHouse/issues/37408): Throw an exception when GROUPING SETS used with ROLLUP or CUBE. [#37367](https://github.com/ClickHouse/ClickHouse/pull/37367) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#37513](https://github.com/ClickHouse/ClickHouse/issues/37513): Fix logical error in normalizeUTF8 functions. Closes [#37298](https://github.com/ClickHouse/ClickHouse/issues/37298). [#37443](https://github.com/ClickHouse/ClickHouse/pull/37443) ([Maksim Kita](https://github.com/kitaisreal)). 
+* Backported in [#37942](https://github.com/ClickHouse/ClickHouse/issues/37942): Fix setting cast_ipv4_ipv6_default_on_conversion_error for internal cast function. Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#37761](https://github.com/ClickHouse/ClickHouse/pull/37761) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#38451](https://github.com/ClickHouse/ClickHouse/issues/38451): Fix bug with nested short-circuit functions that led to execution of arguments even if condition is false. Closes [#38040](https://github.com/ClickHouse/ClickHouse/issues/38040). [#38173](https://github.com/ClickHouse/ClickHouse/pull/38173) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#38544](https://github.com/ClickHouse/ClickHouse/issues/38544): Do not allow recursive usage of OvercommitTracker during logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794) cc @tavplubix @davenger. [#38246](https://github.com/ClickHouse/ClickHouse/pull/38246) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#38708](https://github.com/ClickHouse/ClickHouse/issues/38708): Fix incorrect result of distributed queries with `DISTINCT` and `LIMIT`. Fixes [#38282](https://github.com/ClickHouse/ClickHouse/issues/38282). [#38371](https://github.com/ClickHouse/ClickHouse/pull/38371) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#38595](https://github.com/ClickHouse/ClickHouse/issues/38595): Fix parts removal (will be left forever if they had not been removed on server shutdown) after incorrect server shutdown. [#38486](https://github.com/ClickHouse/ClickHouse/pull/38486) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#38598](https://github.com/ClickHouse/ClickHouse/issues/38598): Fix table creation to avoid replication issues with pre-22.4 replicas. [#38541](https://github.com/ClickHouse/ClickHouse/pull/38541) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#38688](https://github.com/ClickHouse/ClickHouse/issues/38688): Now it's possible to start a clickhouse-server and attach/detach tables even for tables with the incorrect values of IPv4/IPv6 representation. Proper fix for issue [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#38590](https://github.com/ClickHouse/ClickHouse/pull/38590) ([alesapin](https://github.com/alesapin)). +* Backported in [#38664](https://github.com/ClickHouse/ClickHouse/issues/38664): Adapt some more nodes to avoid issues with pre-22.4 replicas. [#38627](https://github.com/ClickHouse/ClickHouse/pull/38627) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#38779](https://github.com/ClickHouse/ClickHouse/issues/38779): `rankCorr` function will work correctly if some arguments are NaNs. This closes [#38396](https://github.com/ClickHouse/ClickHouse/issues/38396). [#38722](https://github.com/ClickHouse/ClickHouse/pull/38722) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#38783](https://github.com/ClickHouse/ClickHouse/issues/38783): Fix use-after-free for Map combinator that leads to incorrect result. [#38748](https://github.com/ClickHouse/ClickHouse/pull/38748) ([Azat Khuzhin](https://github.com/azat)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Try to fix some trash [#37303](https://github.com/ClickHouse/ClickHouse/pull/37303) ([Alexander Tokmakov](https://github.com/tavplubix)). 
+* Update protobuf files for kafka and rabbitmq [fix integration tests] [#37884](https://github.com/ClickHouse/ClickHouse/pull/37884) ([Nikita Taranov](https://github.com/nickitat)). +* Try fix `test_grpc_protocol/test.py::test_progress` [#37908](https://github.com/ClickHouse/ClickHouse/pull/37908) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Try to fix BC check [#38178](https://github.com/ClickHouse/ClickHouse/pull/38178) ([Kruglov Pavel](https://github.com/Avogar)). +* Update docker-compose to try get rid of v1 errors [#38394](https://github.com/ClickHouse/ClickHouse/pull/38394) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix backports diff [#38703](https://github.com/ClickHouse/ClickHouse/pull/38703) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.5.3.21-stable.md b/docs/changelogs/v22.5.3.21-stable.md new file mode 100644 index 00000000000..7c4717575d8 --- /dev/null +++ b/docs/changelogs/v22.5.3.21-stable.md @@ -0,0 +1,24 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.5.3.21-stable (e03724efec5) FIXME as compared to v22.5.2.53-stable (5fd600fda9e) + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#38241](https://github.com/ClickHouse/ClickHouse/issues/38241): Fix possible crash in `Distributed` async insert in case of removing a replica from config. [#38029](https://github.com/ClickHouse/ClickHouse/pull/38029) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#39098](https://github.com/ClickHouse/ClickHouse/issues/39098): Any allocations inside OvercommitTracker may lead to deadlock. Logging was not very informative so it's easier just to remove logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794). [#39030](https://github.com/ClickHouse/ClickHouse/pull/39030) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#39078](https://github.com/ClickHouse/ClickHouse/issues/39078): Fix bug in filesystem cache that could happen in some corner case which coincided with cache capacity hitting the limit. Closes [#39066](https://github.com/ClickHouse/ClickHouse/issues/39066). [#39070](https://github.com/ClickHouse/ClickHouse/pull/39070) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#39152](https://github.com/ClickHouse/ClickHouse/issues/39152): Fix error `Block structure mismatch` which could happen for INSERT into table with attached MATERIALIZED VIEW and enabled setting `extremes = 1`. Closes [#29759](https://github.com/ClickHouse/ClickHouse/issues/29759) and [#38729](https://github.com/ClickHouse/ClickHouse/issues/38729). [#39125](https://github.com/ClickHouse/ClickHouse/pull/39125) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#39274](https://github.com/ClickHouse/ClickHouse/issues/39274): Fixed error `Not found column Type in block` in selects with `PREWHERE` and read-in-order optimizations. [#39157](https://github.com/ClickHouse/ClickHouse/pull/39157) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#39369](https://github.com/ClickHouse/ClickHouse/issues/39369): Declare RabbitMQ queue without default arguments `x-max-length` and `x-overflow`. [#39259](https://github.com/ClickHouse/ClickHouse/pull/39259) ([rnbondarenko](https://github.com/rnbondarenko)). 
+* Backported in [#39350](https://github.com/ClickHouse/ClickHouse/issues/39350): Fix incorrect fetch postgresql tables query fro PostgreSQL database engine. Closes [#33502](https://github.com/ClickHouse/ClickHouse/issues/33502). [#39283](https://github.com/ClickHouse/ClickHouse/pull/39283) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Retry docker buildx commands with progressive sleep in between [#38898](https://github.com/ClickHouse/ClickHouse/pull/38898) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add docker_server.py running to backport and release CIs [#39011](https://github.com/ClickHouse/ClickHouse/pull/39011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.5.4.19-stable.md b/docs/changelogs/v22.5.4.19-stable.md new file mode 100644 index 00000000000..05296a178eb --- /dev/null +++ b/docs/changelogs/v22.5.4.19-stable.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.5.4.19-stable (c893bba830e) FIXME as compared to v22.5.3.21-stable (e03724efec5) + +#### Bug Fix +* Backported in [#39748](https://github.com/ClickHouse/ClickHouse/issues/39748): Fix seeking while reading from encrypted disk. This PR fixes [#38381](https://github.com/ClickHouse/ClickHouse/issues/38381). [#39687](https://github.com/ClickHouse/ClickHouse/pull/39687) ([Vitaly Baranov](https://github.com/vitlibar)). + +#### Build/Testing/Packaging Improvement +* Backported in [#39882](https://github.com/ClickHouse/ClickHouse/issues/39882): Former packages used to install systemd.service file to `/etc`. The files there are marked as `conf` and are not cleaned out, and not updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#39209](https://github.com/ClickHouse/ClickHouse/issues/39209): Fix reading of sparse columns from `MergeTree` tables that store their data in S3. [#37978](https://github.com/ClickHouse/ClickHouse/pull/37978) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#39589](https://github.com/ClickHouse/ClickHouse/issues/39589): Fix data race and possible heap-buffer-overflow in Avro format. Closes [#39094](https://github.com/ClickHouse/ClickHouse/issues/39094) Closes [#33652](https://github.com/ClickHouse/ClickHouse/issues/33652). [#39498](https://github.com/ClickHouse/ClickHouse/pull/39498) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#39611](https://github.com/ClickHouse/ClickHouse/issues/39611): Fix bug with maxsplit argument for splitByChar, which was not working correctly. [#39552](https://github.com/ClickHouse/ClickHouse/pull/39552) ([filimonov](https://github.com/filimonov)). +* Backported in [#39790](https://github.com/ClickHouse/ClickHouse/issues/39790): Fix wrong index analysis with tuples and operator `IN`, which could lead to wrong query result. [#39752](https://github.com/ClickHouse/ClickHouse/pull/39752) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#39835](https://github.com/ClickHouse/ClickHouse/issues/39835): Fix `CANNOT_READ_ALL_DATA` exception with `local_filesystem_read_method=pread_threadpool`. This bug affected only Linux kernel version 5.9 and 5.10 according to [man](https://manpages.debian.org/testing/manpages-dev/preadv2.2.en.html#BUGS). 
[#39800](https://github.com/ClickHouse/ClickHouse/pull/39800) ([Anton Popov](https://github.com/CurtizJ)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix reading from s3 in some corner cases [#38239](https://github.com/ClickHouse/ClickHouse/pull/38239) ([Anton Popov](https://github.com/CurtizJ)). +* Replace MemoryTrackerBlockerInThread to LockMemoryExceptionInThread [#39619](https://github.com/ClickHouse/ClickHouse/pull/39619) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Change mysql-odbc url [#39702](https://github.com/ClickHouse/ClickHouse/pull/39702) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + diff --git a/docs/changelogs/v22.7.3.5-stable.md b/docs/changelogs/v22.7.3.5-stable.md new file mode 100644 index 00000000000..92fe37e4821 --- /dev/null +++ b/docs/changelogs/v22.7.3.5-stable.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.7.3.5-stable (e140b8b5f3a) FIXME as compared to v22.7.2.15-stable (f843089624e) + +#### Build/Testing/Packaging Improvement +* Backported in [#39884](https://github.com/ClickHouse/ClickHouse/issues/39884): Former packages used to install systemd.service file to `/etc`. The files there are marked as `conf` and are not cleaned out, and not updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#39884](https://github.com/ClickHouse/ClickHouse/issues/39884): Former packages used to install systemd.service file to `/etc`. The files there are marked as `conf` and are not cleaned out, and not updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#40045](https://github.com/ClickHouse/ClickHouse/issues/40045): Fix big memory usage during fetches. Fixes [#39915](https://github.com/ClickHouse/ClickHouse/issues/39915). [#39990](https://github.com/ClickHouse/ClickHouse/pull/39990) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#40045](https://github.com/ClickHouse/ClickHouse/issues/40045): Fix big memory usage during fetches. Fixes [#39915](https://github.com/ClickHouse/ClickHouse/issues/39915). [#39990](https://github.com/ClickHouse/ClickHouse/pull/39990) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + From b268e4206a806f83d2fc9589a96a8269e5f7f74d Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 10 Aug 2022 20:09:43 -0400 Subject: [PATCH 148/164] remove indent as it causes codeblock --- docs/en/sql-reference/table-functions/index.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index a51312324f0..95c0d2f8494 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -9,13 +9,13 @@ Table functions are methods for constructing tables. You can use table functions in: -- [FROM](../../sql-reference/statements/select/from.md) clause of the `SELECT` query. +- [FROM](../../sql-reference/statements/select/from.md) clause of the `SELECT` query. - The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. 
+ The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. - [CREATE TABLE AS table_function()](../../sql-reference/statements/create/table.md) query. - It's one of the methods of creating a table. + It's one of the methods of creating a table. - [INSERT INTO TABLE FUNCTION](../../sql-reference/statements/insert-into.md#inserting-into-table-function) query. @@ -38,4 +38,3 @@ You can’t use table functions if the [allow_ddl](../../operations/settings/per | [s3](../../sql-reference/table-functions/s3.md) | Creates a [S3](../../engines/table-engines/integrations/s3.md)-engine table. | | [sqlite](../../sql-reference/table-functions/sqlite.md) | Creates a [sqlite](../../engines/table-engines/integrations/sqlite.md)-engine table. | -[Original article](https://clickhouse.com/docs/en/sql-reference/table-functions/) From 84cd867aa8e5ddc47b61d2614921c6b9941b82d6 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Thu, 11 Aug 2022 10:46:06 +0800 Subject: [PATCH 149/164] materialize column instead of handling column in hash method --- src/Common/ColumnsHashing.h | 51 ++-------------- .../Transforms/IntersectOrExceptTransform.cpp | 9 ++- ...81_intersect_except_const_column.reference | 61 +++++++++++++++++++ .../02381_intersect_except_const_column.sql | 13 ++++ .../02381_intersect_hash_method.reference | 30 --------- .../02381_intersect_hash_method.sql | 3 - 6 files changed, 88 insertions(+), 79 deletions(-) create mode 100644 tests/queries/0_stateless/02381_intersect_except_const_column.reference create mode 100644 tests/queries/0_stateless/02381_intersect_except_const_column.sql delete mode 100644 tests/queries/0_stateless/02381_intersect_hash_method.reference delete mode 100644 tests/queries/0_stateless/02381_intersect_hash_method.sql diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h index 711c1c4096c..c3a087c0a6e 100644 --- a/src/Common/ColumnsHashing.h +++ b/src/Common/ColumnsHashing.h @@ -6,20 +6,15 @@ #include #include #include -#include -#include #include #include -#include #include #include #include -#include #include #include -#include namespace DB @@ -42,36 +37,16 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const char * vec; - FieldType const_value; - std::function get_key_holder_impl; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { vec = key_columns[0]->getRawData().data; - if (isColumnConst(*key_columns[0])) - { - const_value = unalignedLoad(vec); - get_key_holder_impl = [this](size_t /*row*/) { return const_value; }; - } - else - { - get_key_holder_impl = [this](size_t row) { return unalignedLoad(vec + row * sizeof(FieldType)); }; - } } explicit HashMethodOneNumber(const IColumn * column) { vec = column->getRawData().data; - if (isColumnConst(*column)) - { - const_value = unalignedLoad(vec); - get_key_holder_impl = [this](size_t /*row*/) { return const_value; }; - } - else - { - get_key_holder_impl = [this](size_t row) { return unalignedLoad(vec + row * sizeof(FieldType)); }; - } } /// Creates context. Method is called once and result context is used in all threads. @@ -89,7 +64,7 @@ struct HashMethodOneNumber using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t /// Is used for default implementation in HashMethodBase. 
- FieldType getKeyHolder(size_t row, Arena &) const { return get_key_holder_impl(row); } + FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad(vec + row * sizeof(FieldType)); } const FieldType * getKeyData() const { return reinterpret_cast(vec); } }; @@ -105,28 +80,19 @@ struct HashMethodString const IColumn::Offset * offsets; const UInt8 * chars; - std::function get_key_holder_impl; HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { - const IColumn * column = key_columns[0]; - bool column_is_const = isColumnConst(*column); - if (column_is_const) - column = &assert_cast(*column).getDataColumn(); - - const ColumnString & column_string = assert_cast(*column); + const IColumn & column = *key_columns[0]; + const ColumnString & column_string = assert_cast(column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); - - if (column_is_const) - get_key_holder_impl = [this](size_t /*row*/) { return StringRef(chars, offsets[0] - 1); }; - else - get_key_holder_impl = [this](size_t row) { return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); }; } auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const { - StringRef key = get_key_holder_impl(row); + StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); + if constexpr (place_string_to_arena) { return ArenaKeyHolder{key, pool}; @@ -153,7 +119,6 @@ struct HashMethodFixedString size_t n; const ColumnFixedString::Chars * chars; - std::function get_key_holder_impl; HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const HashMethodContextPtr &) { @@ -161,15 +126,11 @@ struct HashMethodFixedString const ColumnFixedString & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); - if (isColumnConst(column)) - get_key_holder_impl = [this](size_t /*row*/) { return StringRef(&(*chars)[0], n); }; - else - get_key_holder_impl = [this](size_t row) { return StringRef(&(*chars)[row * n], n); }; } auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const { - StringRef key = get_key_holder_impl(row); + StringRef key(&(*chars)[row * n], n); if constexpr (place_string_to_arena) { diff --git a/src/Processors/Transforms/IntersectOrExceptTransform.cpp b/src/Processors/Transforms/IntersectOrExceptTransform.cpp index 3e39123ae4b..1ac82e99cf2 100644 --- a/src/Processors/Transforms/IntersectOrExceptTransform.cpp +++ b/src/Processors/Transforms/IntersectOrExceptTransform.cpp @@ -128,7 +128,11 @@ void IntersectOrExceptTransform::accumulate(Chunk chunk) column_ptrs.reserve(key_columns_pos.size()); for (auto pos : key_columns_pos) + { + /// Hash methods expect non-const column + columns[pos] = columns[pos]->convertToFullColumnIfConst(); column_ptrs.emplace_back(columns[pos].get()); + } if (!data) data.emplace(); @@ -160,8 +164,11 @@ void IntersectOrExceptTransform::filter(Chunk & chunk) column_ptrs.reserve(key_columns_pos.size()); for (auto pos : key_columns_pos) + { + /// Hash methods expect non-const column + columns[pos] = columns[pos]->convertToFullColumnIfConst(); column_ptrs.emplace_back(columns[pos].get()); - + } if (!data) data.emplace(); diff --git a/tests/queries/0_stateless/02381_intersect_except_const_column.reference b/tests/queries/0_stateless/02381_intersect_except_const_column.reference new file mode 100644 index 00000000000..290835b412e --- /dev/null +++ 
b/tests/queries/0_stateless/02381_intersect_except_const_column.reference @@ -0,0 +1,61 @@ +fooooo +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 diff --git a/tests/queries/0_stateless/02381_intersect_except_const_column.sql b/tests/queries/0_stateless/02381_intersect_except_const_column.sql new file mode 100644 index 00000000000..b10f913dd1e --- /dev/null +++ b/tests/queries/0_stateless/02381_intersect_except_const_column.sql @@ -0,0 +1,13 @@ +-- Test: crash the server +SELECT 'fooooo' INTERSECT SELECT 'fooooo'; +SELECT 'fooooo' EXCEPT SELECT 'fooooo'; + +-- Test: intersect return incorrect result for const column +SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); +SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); +SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); + +-- Test: except return incorrect result for const column +SELECT 2 FROM numbers(10) EXCEPT SELECT 1 FROM numbers(5); +SELECT toString(2) FROM numbers(10) EXCEPT SELECT toString(1) FROM numbers(5); +SELECT '2' FROM numbers(10) EXCEPT SELECT '1' FROM numbers(5); \ No newline at end of file diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.reference b/tests/queries/0_stateless/02381_intersect_hash_method.reference deleted file mode 100644 index ac8f48bbb7b..00000000000 --- a/tests/queries/0_stateless/02381_intersect_hash_method.reference +++ /dev/null @@ -1,30 +0,0 @@ -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/tests/queries/0_stateless/02381_intersect_hash_method.sql b/tests/queries/0_stateless/02381_intersect_hash_method.sql deleted file mode 100644 index 1154718c686..00000000000 --- a/tests/queries/0_stateless/02381_intersect_hash_method.sql +++ /dev/null @@ -1,3 +0,0 @@ -SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); -SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); -SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); From 5337d21fb498f374fce8abb0cac9d2e0fe6b7bd7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Aug 2022 08:08:20 +0000 Subject: [PATCH 150/164] Mute test. --- tests/queries/0_stateless/00725_memory_tracking.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00725_memory_tracking.sql b/tests/queries/0_stateless/00725_memory_tracking.sql index b7356f0a6aa..ee81502ad83 100644 --- a/tests/queries/0_stateless/00725_memory_tracking.sql +++ b/tests/queries/0_stateless/00725_memory_tracking.sql @@ -1,4 +1,4 @@ --- Tags: no-replicated-database +-- Tags: no-replicated-database, no-tsan, no-asan, no-msan SELECT least(value, 0) FROM system.metrics WHERE metric = 'MemoryTracking'; SELECT length(range(100000000)); From aa42a42e0f5120d587e8f68bb27a1d1ddfcdfc08 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 Aug 2022 08:08:27 +0000 Subject: [PATCH 151/164] Fix documentation of "modulo(a, b)" Fixes #39287 ClickHouse uses the same semantics for modulo on floats as Python, i.e. 4.2 % 2.0 = 0.2 and not as previously documented: 4.2 % 2.0 --> (drop decimal places) --> 4 % 2 = 0. Fixed the documentation. 
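
A minimal SQL sketch of the behavior described above (illustration only, not part of this patch; the exact fractional result is subject to Float64 rounding):

```sql
-- Float inputs keep the fractional part (Python-like semantics):
SELECT 4.2 % 2.0;      -- ~0.2, not 0
-- Integer inputs give an integer result; truncated division is used for negatives, as in C++:
SELECT modulo(7, 3);   -- 1
SELECT modulo(-7, 3);  -- -1
```
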
--- docs/en/sql-reference/functions/arithmetic-functions.md | 6 +++--- docs/ru/sql-reference/functions/arithmetic-functions.md | 2 +- docs/zh/sql-reference/functions/arithmetic-functions.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index b8d2f171bc8..45df5f7f227 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -55,9 +55,9 @@ Differs from ‘intDiv’ in that it returns zero when dividing by zero or when ## modulo(a, b), a % b operator -Calculates the remainder after division. -If arguments are floating-point numbers, they are pre-converted to integers by dropping the decimal portion. -The remainder is taken in the same sense as in C++. Truncated division is used for negative numbers. +Calculates the remainder when dividing `a` by `b`. +The result type is an integer if both inputs are integers. If one of the inputs is a floating-point number, the result is a floating-point number. +The remainder is computed like in C++. Truncated division is used for negative numbers. An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. ## moduloOrZero(a, b) diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index c8f2e31cb0b..19af81e609d 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -56,7 +56,7 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 ## modulo(a, b), оператор a % b {#modulo} Вычисляет остаток от деления. -Если аргументы - числа с плавающей запятой, то они предварительно преобразуются в целые числа, путём отбрасывания дробной части. +Тип результата - целое число, если оба входа - целые числа. Если один из входов является числом с плавающей точкой, результатом будет число с плавающей точкой. Берётся остаток в том же смысле, как это делается в C++. По факту, для отрицательных чисел, используется truncated division. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. 
diff --git a/docs/zh/sql-reference/functions/arithmetic-functions.md b/docs/zh/sql-reference/functions/arithmetic-functions.md index 15bec0d2107..acba761b619 100644 --- a/docs/zh/sql-reference/functions/arithmetic-functions.md +++ b/docs/zh/sql-reference/functions/arithmetic-functions.md @@ -54,7 +54,7 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 ## modulo(a, b), a % b operator {#modulo} 计算除法后的余数。 -如果参数是浮点数,则通过删除小数部分将它们预转换为整数。 +如果两个输入都是整数,结果类型是整数。如果其中一个输入是浮点数,则结果是浮点数。 其余部分与C++中的含义相同。截断除法用于负数。 除以零或将最小负数除以-1时抛出异常。 From 1f10c4be8cb6193693c0bf27cc84f8eee5b8191a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 Aug 2022 11:25:33 +0200 Subject: [PATCH 152/164] Update docs/ru/sql-reference/functions/arithmetic-functions.md Co-authored-by: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> --- docs/ru/sql-reference/functions/arithmetic-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index 19af81e609d..ba4340093b4 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -56,7 +56,7 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 ## modulo(a, b), оператор a % b {#modulo} Вычисляет остаток от деления. -Тип результата - целое число, если оба входа - целые числа. Если один из входов является числом с плавающей точкой, результатом будет число с плавающей точкой. +Тип результата - целое число, если оба аргумента - целые числа. Если один из аргументов является числом с плавающей точкой, результатом будет число с плавающей точкой. Берётся остаток в том же смысле, как это делается в C++. По факту, для отрицательных чисел, используется truncated division. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. From 9f455329994fa93ba96be984a28f08befb436d28 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 11 Aug 2022 12:05:16 +0200 Subject: [PATCH 153/164] Use a job ID as ref text --- tests/ci/report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/report.py b/tests/ci/report.py index f4569f75b82..7d84185b863 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -239,8 +239,8 @@ def create_test_html_report( ) raw_log_name = os.path.basename(raw_log_url) - if raw_log_name.endswith("?check_suite_focus=true"): - raw_log_name = "Job (github actions)" + if "?" 
in raw_log_name: + raw_log_name = raw_log_name.split("?")[0] result = HTML_BASE_TEST_TEMPLATE.format( title=_format_header(header, branch_name), From 261ccc35cf2004c2f0603717a42a95e0ecbe8b2c Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Thu, 11 Aug 2022 12:18:44 +0200 Subject: [PATCH 154/164] Rename SettingAutoWrapper, add comment to readBinary Co-authored-by: Azat Khuzhin Co-authored-by: Maksim Kita --- src/Core/Settings.h | 2 +- src/Core/SettingsFields.cpp | 6 ++--- src/Core/SettingsFields.h | 24 ++++++++++++------- .../02381_setting_value_auto.reference | 8 +++---- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 727e45e3e50..f5108031dfb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -212,7 +212,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \ \ - M(UInt64WithAuto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \ + M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \ M(Milliseconds, insert_quorum_timeout, 600000, "", 0) \ M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \ M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \ diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index 86b20da9e8c..5b1b6b10cc2 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -153,9 +153,9 @@ template struct SettingFieldNumber; template struct SettingFieldNumber; template struct SettingFieldNumber; -template struct SettingWithAuto>; -template struct SettingWithAuto>; -template struct SettingWithAuto>; +template struct SettingAutoWrapper>; +template struct SettingAutoWrapper>; +template struct SettingAutoWrapper>; namespace { diff --git a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index 0fcc1d6783f..68c6e85796e 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -66,7 +66,7 @@ using SettingFieldBool = SettingFieldNumber; * but when serializing 'auto' old version will see binary representation of the default value. 
*/ template -struct SettingWithAuto +struct SettingAutoWrapper { constexpr static auto keyword = "auto"; static bool isAuto(const Field & f) { return f.getType() == Field::Types::String && f.safeGet() == keyword; } @@ -78,17 +78,17 @@ struct SettingWithAuto bool is_auto = false; bool changed = false; - explicit SettingWithAuto() : is_auto(true) {} - explicit SettingWithAuto(Type val) : is_auto(false) { base = Base(val); } + explicit SettingAutoWrapper() : is_auto(true) {} + explicit SettingAutoWrapper(Type val) : is_auto(false) { base = Base(val); } - explicit SettingWithAuto(const Field & f) + explicit SettingAutoWrapper(const Field & f) : is_auto(isAuto(f)) { if (!is_auto) base = Base(f); } - SettingWithAuto & operator=(const Field & f) + SettingAutoWrapper & operator=(const Field & f) { changed = true; if (is_auto = isAuto(f); !is_auto) @@ -115,16 +115,22 @@ struct SettingWithAuto base.writeBinary(out); } + /* + * That it is fine to reset `is_auto` here and to use default value in case `is_auto` + * because settings will be serialized only if changed. + * If they were changed they were requested to use explicit value instead of `auto`. + * And so interactions between client-server, and server-server (distributed queries), should be OK. + */ void readBinary(ReadBuffer & in) { changed = true; is_auto = false; base.readBinary(in); } Type valueOr(Type default_value) const { return is_auto ? default_value : base.value; } }; -using SettingFieldUInt64WithAuto = SettingWithAuto; -using SettingFieldInt64WithAuto = SettingWithAuto; -using SettingFieldFloatWithAuto = SettingWithAuto; +using SettingFieldUInt64Auto = SettingAutoWrapper; +using SettingFieldInt64Auto = SettingAutoWrapper; +using SettingFieldFloatAuto = SettingAutoWrapper; -/* Similar to SettingFieldUInt64WithAuto with small differences to behave like regular UInt64, supported to compatibility. +/* Similar to SettingFieldUInt64Auto with small differences to behave like regular UInt64, supported to compatibility. * When setting to 'auto' it becomes equal to the number of processor cores without taking into account SMT. * A value of 0 is also treated as 'auto', so 'auto' is parsed and serialized in the same way as 0. 
*/ diff --git a/tests/queries/0_stateless/02381_setting_value_auto.reference b/tests/queries/0_stateless/02381_setting_value_auto.reference index 72c87cf6f7d..acc5025da5e 100644 --- a/tests/queries/0_stateless/02381_setting_value_auto.reference +++ b/tests/queries/0_stateless/02381_setting_value_auto.reference @@ -1,4 +1,4 @@ -0 0 UInt64WithAuto -auto 1 UInt64WithAuto -0 1 UInt64WithAuto -1 1 UInt64WithAuto +0 0 UInt64Auto +auto 1 UInt64Auto +0 1 UInt64Auto +1 1 UInt64Auto From 61ad12279e20d58be2870df775b4197bb996c03f Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 11 Aug 2022 10:24:12 +0000 Subject: [PATCH 155/164] Delete files DictionaryJoinAdapter.h/cpp Follow-up for https://github.com/ClickHouse/ClickHouse/pull/38956 --- src/Interpreters/DictionaryJoinAdapter.cpp | 8 -------- src/Interpreters/DictionaryJoinAdapter.h | 7 ------- src/Interpreters/ExpressionAnalyzer.cpp | 1 - 3 files changed, 16 deletions(-) delete mode 100644 src/Interpreters/DictionaryJoinAdapter.cpp delete mode 100644 src/Interpreters/DictionaryJoinAdapter.h diff --git a/src/Interpreters/DictionaryJoinAdapter.cpp b/src/Interpreters/DictionaryJoinAdapter.cpp deleted file mode 100644 index bf0ad373204..00000000000 --- a/src/Interpreters/DictionaryJoinAdapter.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#include - - -namespace DB -{ - - -} diff --git a/src/Interpreters/DictionaryJoinAdapter.h b/src/Interpreters/DictionaryJoinAdapter.h deleted file mode 100644 index dade5da94e6..00000000000 --- a/src/Interpreters/DictionaryJoinAdapter.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - - -namespace DB -{ - -} diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 0d4fc28c5ba..105d46eed1f 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include From d1051d822c02804292ac1e9486c104247e37418f Mon Sep 17 00:00:00 2001 From: Jianmei Zhang <66244986+zhangjmruc@users.noreply.github.com> Date: Thu, 11 Aug 2022 18:39:40 +0800 Subject: [PATCH 156/164] Use getSerializedFileExtension() to get correct file extension for index (#40095) --- src/Storages/MergeTree/MutateTask.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index d52948d71c1..63aabd20115 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -490,7 +490,8 @@ static NameSet collectFilesToSkip( for (const auto & index : indices_to_recalc) { - files_to_skip.insert(index->getFileName() + ".idx"); + /// Since MinMax index has .idx2 extension, we need to add correct extension. + files_to_skip.insert(index->getFileName() + index->getSerializedFileExtension()); files_to_skip.insert(index->getFileName() + mrk_extension); } From 96776d30287914a3bc0eba38dfdf2965c9bbec70 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Thu, 11 Aug 2022 13:15:28 +0200 Subject: [PATCH 157/164] Trim trailing whitespaces in SettingsFields.h --- src/Core/SettingsFields.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index 68c6e85796e..f01ac37d3cc 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -115,7 +115,7 @@ struct SettingAutoWrapper base.writeBinary(out); } - /* + /* * That it is fine to reset `is_auto` here and to use default value in case `is_auto` * because settings will be serialized only if changed. 
* If they were changed they were requested to use explicit value instead of `auto`. From fa8fab2e8f15b446467393d66cdc4fa77216193a Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 28 Jul 2022 17:40:09 +0800 Subject: [PATCH 158/164] Fix KeyCondition with other filters --- src/Interpreters/InterpreterSelectQuery.cpp | 11 ++++-- src/Interpreters/InterpreterSelectQuery.h | 2 ++ .../optimizePrimaryKeyCondition.cpp | 35 ++++++++++++------- .../QueryPlan/ReadFromMergeTree.cpp | 28 ++++++++------- src/Processors/QueryPlan/ReadFromMergeTree.h | 17 +++++---- src/Storages/MergeTree/KeyCondition.cpp | 31 +++++++++++++--- src/Storages/MergeTree/KeyCondition.h | 16 +++++++-- src/Storages/MergeTree/MergeTreeData.cpp | 31 ++++++++-------- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 6 ++-- .../MergeTree/MergeTreeDataSelectExecutor.h | 3 +- .../MergeTree/MergeTreeIndexMinMax.cpp | 4 +-- src/Storages/MergeTree/PartitionPruner.h | 4 +-- src/Storages/SelectQueryInfo.h | 4 ++- src/Storages/StorageMerge.cpp | 2 +- src/Storages/StorageMerge.h | 8 +++-- ...10_projection_additional_filters.reference | 1 + .../01710_projection_additional_filters.sql | 9 +++++ .../01710_projection_row_policy.reference | 1 + .../01710_projection_row_policy.sql | 13 +++++++ 19 files changed, 153 insertions(+), 73 deletions(-) create mode 100644 tests/queries/0_stateless/01710_projection_additional_filters.reference create mode 100644 tests/queries/0_stateless/01710_projection_additional_filters.sql create mode 100644 tests/queries/0_stateless/01710_projection_row_policy.reference create mode 100644 tests/queries/0_stateless/01710_projection_row_policy.sql diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 1d009ec3f3b..205ec049975 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -608,11 +608,15 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (storage) { + query_info.filter_asts.clear(); + /// Fix source_header for filter actions. 
if (row_policy_filter) { filter_info = generateFilterActions( table_id, row_policy_filter, context, storage, storage_snapshot, metadata_snapshot, required_columns); + + query_info.filter_asts.push_back(row_policy_filter); } if (query_info.additional_filter_ast) @@ -621,6 +625,8 @@ InterpreterSelectQuery::InterpreterSelectQuery( table_id, query_info.additional_filter_ast, context, storage, storage_snapshot, metadata_snapshot, required_columns); additional_filter_info->do_remove_column = true; + + query_info.filter_asts.push_back(query_info.additional_filter_ast); } source_header = storage_snapshot->getSampleBlockForColumns(required_columns); @@ -2002,8 +2008,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc && storage && storage->getName() != "MaterializedMySQL" && !storage->hasLightweightDeletedMask() - && !row_policy_filter - && !query_info.additional_filter_ast + && query_info.filter_asts.empty() && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() && (query_analyzer->aggregates().size() == 1) @@ -2103,7 +2108,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc && !query.limit_with_ties && !query.prewhere() && !query.where() - && !query_info.additional_filter_ast + && query_info.filter_asts.empty() && !query.groupBy() && !query.having() && !query.orderBy() diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index f2cdcbba9ed..a94c9cb5462 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -127,6 +127,8 @@ public: /// It will set shard_num and shard_count to the client_info void setProperClientInfo(size_t replica_num, size_t replica_count); + FilterDAGInfoPtr getAdditionalQueryInfo() const { return additional_filter_info; } + static SortDescription getSortDescription(const ASTSelectQuery & query, const ContextPtr & context); static UInt64 getLimitForSorting(const ASTSelectQuery & query, const ContextPtr & context); diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp index e559c23bbaf..7d682c408e5 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp @@ -1,9 +1,10 @@ #include +#include #include #include #include #include -#include +#include namespace DB::QueryPlanOptimizations { @@ -16,33 +17,41 @@ void optimizePrimaryKeyCondition(QueryPlan::Node & root) size_t next_child = 0; }; - std::stack stack; - stack.push({.node = &root}); + std::deque stack; + stack.push_back({.node = &root}); while (!stack.empty()) { - auto & frame = stack.top(); + auto & frame = stack.back(); /// Traverse all children first. 
if (frame.next_child < frame.node->children.size()) { - stack.push({.node = frame.node->children[frame.next_child]}); + stack.push_back({.node = frame.node->children[frame.next_child]}); ++frame.next_child; continue; } - if (auto * filter_step = typeid_cast(frame.node->step.get())) + auto add_filter = [&](auto & storage) { - auto * child = frame.node->children.at(0); - if (auto * read_from_merge_tree = typeid_cast(child->step.get())) - read_from_merge_tree->addFilter(filter_step->getExpression(), filter_step->getFilterColumnName()); + for (auto iter=stack.rbegin() + 1; iter!=stack.rend(); ++iter) + { + if (auto * filter_step = typeid_cast(iter->node->step.get())) + storage.addFilter(filter_step->getExpression(), filter_step->getFilterColumnName()); + else if (typeid_cast(iter->node->step.get())) + ; + else + break; + } + }; - if (auto * read_from_merge = typeid_cast(child->step.get())) - read_from_merge->addFilter(filter_step->getExpression(), filter_step->getFilterColumnName()); - } + if (auto * read_from_merge_tree = typeid_cast(frame.node->step.get())) + add_filter(*read_from_merge_tree); + else if (auto * read_from_merge = typeid_cast(frame.node->step.get())) + add_filter(*read_from_merge); - stack.pop(); + stack.pop_back(); } } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 0d6f591b43a..14b06f9704b 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -835,8 +835,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge return selectRangesToRead( std::move(parts), prewhere_info, - added_filter, - added_filter_column_name, + added_filter_nodes, storage_snapshot->metadata, storage_snapshot->getMetadataForQuery(), query_info, @@ -852,8 +851,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( MergeTreeData::DataPartsVector parts, const PrewhereInfoPtr & prewhere_info, - const ActionsDAGPtr & added_filter, - const std::string & added_filter_column_name, + const ActionDAGNodes & added_filter_nodes, const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot, const SelectQueryInfo & query_info, @@ -895,17 +893,23 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( ActionDAGNodes nodes; if (prewhere_info) { - const auto & node = prewhere_info->prewhere_actions->findInOutputs(prewhere_info->prewhere_column_name); - nodes.nodes.push_back(&node); + { + const auto & node = prewhere_info->prewhere_actions->findInOutputs(prewhere_info->prewhere_column_name); + nodes.nodes.push_back(&node); + } + + if (prewhere_info->row_level_filter) + { + const auto & node = prewhere_info->row_level_filter->findInOutputs(prewhere_info->row_level_column_name); + nodes.nodes.push_back(&node); + } } - if (added_filter) - { - const auto & node = added_filter->findInOutputs(added_filter_column_name); - nodes.nodes.push_back(&node); - } + for (const auto & node : added_filter_nodes.nodes) + nodes.nodes.push_back(node); - key_condition.emplace(std::move(nodes), query_info.syntax_analyzer_result, query_info.prepared_sets, context, primary_key_columns, primary_key.expression); + key_condition.emplace( + std::move(nodes), query_info.syntax_analyzer_result, query_info.prepared_sets, context, primary_key_columns, primary_key.expression); } else { diff --git 
a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 1ba68b3fdb3..318f5a4b91f 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -116,8 +116,14 @@ public: void addFilter(ActionsDAGPtr expression, std::string column_name) { - added_filter = std::move(expression); - added_filter_column_name = std::move(column_name); + added_filter_dags.push_back(expression); + added_filter_nodes.nodes.push_back(&expression->findInOutputs(column_name)); + } + + void addFilterNodes(const ActionDAGNodes & filter_nodes) + { + for (const auto & node : filter_nodes.nodes) + added_filter_nodes.nodes.push_back(node); } StorageID getStorageID() const { return data.getStorageID(); } @@ -128,8 +134,7 @@ public: static MergeTreeDataSelectAnalysisResultPtr selectRangesToRead( MergeTreeData::DataPartsVector parts, const PrewhereInfoPtr & prewhere_info, - const ActionsDAGPtr & added_filter, - const std::string & added_filter_column_name, + const ActionDAGNodes & added_filter_nodes, const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot, const SelectQueryInfo & query_info, @@ -160,8 +165,8 @@ private: PrewhereInfoPtr prewhere_info; ExpressionActionsSettings actions_settings; - ActionsDAGPtr added_filter; - std::string added_filter_column_name; + std::vector added_filter_dags; + ActionDAGNodes added_filter_nodes; StorageSnapshotPtr storage_snapshot; StorageMetadataPtr metadata_for_reading; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 7128558b734..b42fe49a1d0 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -854,6 +854,7 @@ static NameSet getAllSubexpressionNames(const ExpressionActions & key_expr) KeyCondition::KeyCondition( const ASTPtr & query, + const ASTs & additional_filter_asts, TreeRewriterResultPtr syntax_analyzer_result, PreparedSetsPtr prepared_sets_, ContextPtr context, @@ -883,13 +884,35 @@ KeyCondition::KeyCondition( array_joined_columns.insert(name); const ASTSelectQuery & select = query->as(); - if (select.where() || select.prewhere()) + + ASTs filters; + if (select.where()) + filters.push_back(select.where()); + + if (select.prewhere()) + filters.push_back(select.prewhere()); + + for (const auto & filter_ast : additional_filter_asts) + filters.push_back(filter_ast); + + if (!filters.empty()) { ASTPtr filter_query; - if (select.where() && select.prewhere()) - filter_query = makeASTFunction("and", select.where(), select.prewhere()); + if (filters.size() == 1) + { + filter_query = filters.front(); + } else - filter_query = select.where() ? select.where() : select.prewhere(); + { + auto function = std::make_shared(); + + function->name = "and"; + function->arguments = std::make_shared(); + function->children.push_back(function->arguments); + function->arguments->children = std::move(filters); + + filter_query = function; + } /** When non-strictly monotonic functions are employed in functional index (e.g. ORDER BY toStartOfHour(dateTime)), * the use of NOT operator in predicate will result in the indexing algorithm leave out some data. diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 3c2089a56d7..586bc43f791 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -208,6 +208,7 @@ public: /// Does not take into account the SAMPLE section. 
all_columns - the set of all columns of the table. KeyCondition( const ASTPtr & query, + const ASTs & additional_filter_asts, TreeRewriterResultPtr syntax_analyzer_result, PreparedSetsPtr prepared_sets_, ContextPtr context, @@ -223,9 +224,18 @@ public: const ExpressionActionsPtr & key_expr_, bool single_point_ = false, bool strict_ = false) - : KeyCondition(query_info.query, query_info.syntax_analyzer_result, query_info.prepared_sets, - context, key_column_names, key_expr_, single_point_, strict_) - {} + : KeyCondition( + query_info.query, + query_info.filter_asts, + query_info.syntax_analyzer_result, + query_info.prepared_sets, + context, + key_column_names, + key_expr_, + single_point_, + strict_) + { + } KeyCondition( ActionDAGNodes dag_nodes, diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 594b4a32f9c..63705fbcdf5 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5142,8 +5142,7 @@ static void selectBestProjection( const MergeTreeDataSelectExecutor & reader, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - const ActionsDAGPtr & added_filter, - const std::string & added_filter_column_name, + const ActionDAGNodes & added_filter_nodes, const Names & required_columns, ProjectionCandidate & candidate, ContextPtr query_context, @@ -5174,8 +5173,7 @@ static void selectBestProjection( storage_snapshot->metadata, candidate.desc->metadata, query_info, - added_filter, - added_filter_column_name, + added_filter_nodes, query_context, settings.max_threads, max_added_blocks); @@ -5198,8 +5196,7 @@ static void selectBestProjection( storage_snapshot->metadata, storage_snapshot->metadata, query_info, // TODO syntax_analysis_result set in index - added_filter, - added_filter_column_name, + added_filter_nodes, query_context, settings.max_threads, max_added_blocks); @@ -5524,6 +5521,14 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg const auto & before_where = analysis_result.before_where; const auto & where_column_name = analysis_result.where_column_name; + /// For PK analysis + ActionDAGNodes added_filter_nodes; + if (auto additional_filter_info = select.getAdditionalQueryInfo()) + added_filter_nodes.nodes.push_back(&additional_filter_info->actions->findInOutputs(additional_filter_info->column_name)); + + if (before_where) + added_filter_nodes.nodes.push_back(&before_where->findInOutputs(where_column_name)); + bool can_use_aggregate_projection = true; /// If the first stage of the query pipeline is more complex than Aggregating - Expression - Filter - ReadFromStorage, /// we cannot use aggregate projection. 
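In short: SelectQueryInfo now carries the row-level-policy and additional_table_filters predicates in filter_asts, KeyCondition folds them together with WHERE and PREWHERE into a single conjunction for primary-key and partition-pruning analysis, and the projection-selection code above forwards the same predicates as added_filter_nodes. The new 01710_projection_additional_filters and 01710_projection_row_policy tests below pin this down: with a filter that rejects every row, count(), min(a) and max(a) must come back as 0 0 0 instead of being answered from an unfiltered minmax_count projection.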
@@ -5750,7 +5755,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg query_info.minmax_count_projection_block = getMinMaxCountProjectionBlock( metadata_snapshot, minmax_count_projection_candidate->required_columns, - analysis_result.prewhere_info || analysis_result.before_where, + !query_info.filter_asts.empty() || analysis_result.prewhere_info || analysis_result.before_where, query_info, parts, normal_parts, @@ -5792,8 +5797,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg metadata_snapshot, metadata_snapshot, query_info, - before_where, - where_column_name, + added_filter_nodes, query_context, settings.max_threads, max_added_blocks); @@ -5825,8 +5829,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg metadata_snapshot, metadata_snapshot, query_info, - before_where, - where_column_name, + added_filter_nodes, query_context, settings.max_threads, max_added_blocks); @@ -5852,8 +5855,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg reader, storage_snapshot, query_info, - before_where, - where_column_name, + added_filter_nodes, analysis_result.required_columns, candidate, query_context, @@ -5874,8 +5876,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg reader, storage_snapshot, query_info, - before_where, - where_column_name, + added_filter_nodes, analysis_result.required_columns, candidate, query_context, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index ba3505b5886..c5f546a9c36 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1273,8 +1273,7 @@ MergeTreeDataSelectAnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot, const SelectQueryInfo & query_info, - const ActionsDAGPtr & added_filter, - const std::string & added_filter_column_name, + const ActionDAGNodes & added_filter_nodes, ContextPtr context, unsigned num_streams, std::shared_ptr max_block_numbers_to_read) const @@ -1295,8 +1294,7 @@ MergeTreeDataSelectAnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar return ReadFromMergeTree::selectRangesToRead( std::move(parts), query_info.prewhere_info, - added_filter, - added_filter_column_name, + added_filter_nodes, metadata_snapshot_base, metadata_snapshot, query_info, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 899cf1f2862..bb44f260eec 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -60,8 +60,7 @@ public: const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot, const SelectQueryInfo & query_info, - const ActionsDAGPtr & added_filter, - const std::string & added_filter_column_name, + const ActionDAGNodes & added_filter_nodes, ContextPtr context, unsigned num_streams, std::shared_ptr max_block_numbers_to_read = nullptr) const; diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index 05319ecc62e..b190ac2b2fd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -157,9 +157,7 @@ void MergeTreeIndexAggregatorMinMax::update(const Block & block, size_t * pos, s MergeTreeIndexConditionMinMax::MergeTreeIndexConditionMinMax( - const 
IndexDescription & index, - const SelectQueryInfo & query, - ContextPtr context) + const IndexDescription & index, const SelectQueryInfo & query, ContextPtr context) : index_data_types(index.data_types) , condition(query, context, index.column_names, index.expression) { diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h index 675fef1433d..9953c52b593 100644 --- a/src/Storages/MergeTree/PartitionPruner.h +++ b/src/Storages/MergeTree/PartitionPruner.h @@ -26,9 +26,7 @@ private: public: PartitionPruner(const StorageMetadataPtr & metadata, const SelectQueryInfo & query_info, ContextPtr context, bool strict) : partition_key(MergeTreePartition::adjustPartitionKey(metadata, context)) - , partition_condition( - query_info.query, query_info.syntax_analyzer_result, query_info.prepared_sets, - context, partition_key.column_names, partition_key.expression, true /* single_point */, strict) + , partition_condition(query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict) , useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue()) { } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 4a3db2e8497..909da5bebba 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -156,9 +156,11 @@ struct SelectQueryInfo TreeRewriterResultPtr syntax_analyzer_result; /// This is an additional filer applied to current table. - /// It is needed only for additional PK filtering. ASTPtr additional_filter_ast; + /// It is needed for PK analysis based on row_level_policy and additional_filters. + ASTs filter_asts; + ReadInOrderOptimizerPtr order_optimizer; /// Can be modified while reading from storage InputOrderInfoPtr input_order_info; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 666717e50a0..2bedf406b7d 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -542,7 +542,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( return {}; if (auto * read_from_merge_tree = typeid_cast(plan.getRootNode()->step.get())) - read_from_merge_tree->addFilter(added_filter, added_filter_column_name); + read_from_merge_tree->addFilterNodes(added_filter_nodes); builder = plan.buildQueryPipeline( QueryPlanOptimizationSettings::fromContext(modified_context), diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index d2f94ac6b88..6bf68660803 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -140,8 +140,8 @@ public: void addFilter(ActionsDAGPtr expression, std::string column_name) { - added_filter = std::move(expression); - added_filter_column_name = std::move(column_name); + added_filter_dags.push_back(expression); + added_filter_nodes.nodes.push_back(&expression->findInOutputs(column_name)); } private: @@ -160,7 +160,9 @@ private: ContextMutablePtr context; QueryProcessingStage::Enum common_processed_stage; - ActionsDAGPtr added_filter; + std::vector added_filter_dags; + ActionDAGNodes added_filter_nodes; + std::string added_filter_column_name; struct AliasData diff --git a/tests/queries/0_stateless/01710_projection_additional_filters.reference b/tests/queries/0_stateless/01710_projection_additional_filters.reference new file mode 100644 index 00000000000..06b63ea6c2f --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_additional_filters.reference @@ -0,0 +1 @@ +0 0 0 diff --git 
a/tests/queries/0_stateless/01710_projection_additional_filters.sql b/tests/queries/0_stateless/01710_projection_additional_filters.sql new file mode 100644 index 00000000000..1633b48ba7e --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_additional_filters.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS t; + +CREATE TABLE t(a UInt32, b UInt32) ENGINE = MergeTree PARTITION BY a ORDER BY a; + +INSERT INTO t SELECT number % 10, number FROM numbers(10000); + +SELECT count(), min(a), max(a) FROM t SETTINGS additional_table_filters = {'t' : '0'}; + +DROP TABLE t; diff --git a/tests/queries/0_stateless/01710_projection_row_policy.reference b/tests/queries/0_stateless/01710_projection_row_policy.reference new file mode 100644 index 00000000000..06b63ea6c2f --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_row_policy.reference @@ -0,0 +1 @@ +0 0 0 diff --git a/tests/queries/0_stateless/01710_projection_row_policy.sql b/tests/queries/0_stateless/01710_projection_row_policy.sql new file mode 100644 index 00000000000..a54cc50b9e9 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_row_policy.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS t; + +CREATE TABLE t(a UInt32, b UInt32) ENGINE = MergeTree PARTITION BY a ORDER BY a; + +INSERT INTO t SELECT number % 10, number FROM numbers(10000); + +CREATE ROW POLICY OR REPLACE rp ON t FOR SELECT USING 0 TO ALL; + +SELECT count(), min(a), max(a) FROM t; + +DROP ROW POLICY rp ON t; + +DROP TABLE t; From 11a274e9901f475bff7459356eb9c2820a683a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 11 Aug 2022 13:27:53 +0200 Subject: [PATCH 159/164] Clean up constinit usage and add a comment --- cmake/warnings.cmake | 1 + src/Common/ThreadStatus.cpp | 8 -------- src/Common/ThreadStatus.h | 16 +++++++++------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index 994f14c6149..a8f12fe26dd 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -23,6 +23,7 @@ if (COMPILER_CLANG) no_warning(zero-length-array) no_warning(c++98-compat-pedantic) no_warning(c++98-compat) + no_warning(c++20-compat) # Use constinit in C++20 without warnings no_warning(conversion) no_warning(ctad-maybe-unsupported) # clang 9+, linux-only no_warning(disabled-macro-expansion) diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 98f78cada5c..423a44c97d6 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -24,15 +24,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wc++20-compat" -#endif thread_local ThreadStatus constinit * current_thread = nullptr; -thread_local ThreadStatus * main_thread = nullptr; -#ifdef __clang__ -#pragma clang diagnostic pop -#endif #if !defined(SANITIZER) namespace diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 594e86ffa2e..0b01f43a226 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -102,14 +102,16 @@ public: using ThreadGroupStatusPtr = std::shared_ptr; -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wc++20-compat" -#endif +/** + * We use **constinit** here to tell the compiler the current_thread variable is initialized. + * If we didn't help the compiler, then it would most likely add a check before every use of the variable to initialize it if needed. 
+ * Instead it will trust that we are doing the right thing (and we do initialize it to nullptr) and emit more optimal code. + * This is noticeable in functions like CurrentMemoryTracker::free and CurrentMemoryTracker::allocImpl + * See also: + * - https://en.cppreference.com/w/cpp/language/constinit + * - https://github.com/ClickHouse/ClickHouse/pull/40078 + */ extern thread_local constinit ThreadStatus * current_thread; -#ifdef __clang__ -#pragma clang diagnostic pop -#endif /** Encapsulates all per-thread info (ProfileEvents, MemoryTracker, query_id, query context, etc.). * The object must be created in thread function and destroyed in the same thread before the exit. From cad311565c94d6799d4d854655a7ce470169379f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 11 Aug 2022 21:57:04 +0300 Subject: [PATCH 160/164] Update 02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh --- .../02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh index 43f6d62bd10..65025858e20 100755 --- a/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh +++ b/tests/queries/0_stateless/02390_prometheus_ClickHouseStatusInfo_DictionaryStatus.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-ordinary-database CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 687ea5dd762799db9c742dbf13a5333c39809f07 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 11 Aug 2022 22:13:22 +0200 Subject: [PATCH 161/164] Update FileCache.cpp --- src/Common/FileCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index 47b7d57ae66..ca826a6e359 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -1064,7 +1064,7 @@ void FileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock if (!parsed) { - LOG_WARNING(log, "Unexpected file: ", offset_it->path().string()); + LOG_WARNING(log, "Unexpected file: {}", offset_it->path().string()); continue; /// Or just remove? Some unexpected file. } From 004a4d4947d53ac49d221d302004e913e06af091 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 11 Aug 2022 22:15:33 +0200 Subject: [PATCH 162/164] Update FileCache.cpp --- src/Common/FileCache.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Common/FileCache.cpp b/src/Common/FileCache.cpp index ca826a6e359..5a59bb6182a 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/FileCache.cpp @@ -1044,8 +1044,13 @@ void FileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock fs::directory_iterator key_it{key_prefix_it->path()}; for (; key_it != fs::directory_iterator(); ++key_it) { - key = Key(unhexUInt(key_it->path().filename().string().data())); + if (!key_it->is_directory()) + { + LOG_WARNING(log, "Unexpected file: {}. 
Expected a directory", key_it->path().string()); + continue; + } + key = Key(unhexUInt(key_it->path().filename().string().data())); fs::directory_iterator offset_it{key_it->path()}; for (; offset_it != fs::directory_iterator(); ++offset_it) { From 55ff5463227f29e669cfcbd7766d0afffdb508cf Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 11 Aug 2022 20:42:59 +0000 Subject: [PATCH 163/164] Fix typo --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- docs/zh/sql-reference/functions/type-conversion-functions.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3612de7a0d4..ecdf34bf7ee 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1241,7 +1241,7 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that ## toLowCardinality -Converts input parameter to the [LowCardianlity](../../sql-reference/data-types/lowcardinality.md) version of same data type. +Converts input parameter to the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) version of same data type. To convert data from the `LowCardinality` data type use the [CAST](#type_conversion_function-cast) function. For example, `CAST(x as String)`. diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 679aa00073e..7635bda78e6 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1162,7 +1162,7 @@ FORMAT PrettyCompactMonoBlock; ## toLowCardinality {#tolowcardinality} -Преобразует входные данные в версию [LowCardianlity](../data-types/lowcardinality.md) того же типа данных. +Преобразует входные данные в версию [LowCardinality](../data-types/lowcardinality.md) того же типа данных. Чтобы преобразовать данные из типа `LowCardinality`, используйте функцию [CAST](#type_conversion_function-cast). Например, `CAST(x as String)`. 
diff --git a/docs/zh/sql-reference/functions/type-conversion-functions.md b/docs/zh/sql-reference/functions/type-conversion-functions.md index b72dc438e0d..d2330df6cb1 100644 --- a/docs/zh/sql-reference/functions/type-conversion-functions.md +++ b/docs/zh/sql-reference/functions/type-conversion-functions.md @@ -512,7 +512,7 @@ SELECT parseDateTimeBestEffort('10 20:19') ## toLowCardinality {#tolowcardinality} -把输入值转换为[LowCardianlity](../data-types/lowcardinality.md)的相同类型的数据。 +把输入值转换为[LowCardinality](../data-types/lowcardinality.md)的相同类型的数据。 如果要把`LowCardinality`类型的数据转换为其他类型,使用[CAST](#type_conversion_function-cast)函数。比如:`CAST(x as String)`。 From 17956cb668e2af2f9f19b5618820e9e5c843badd Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 12 Aug 2022 14:28:35 +0200 Subject: [PATCH 164/164] Extend protocol with query parameters (#39906) --- programs/client/Client.cpp | 1 + src/Client/ClientBase.cpp | 9 ++- src/Client/Connection.cpp | 9 +++ src/Client/Connection.h | 1 + src/Client/HedgedConnections.cpp | 2 +- src/Client/IServerConnection.h | 1 + src/Client/LocalConnection.cpp | 4 + src/Client/LocalConnection.h | 1 + src/Client/MultiplexedConnections.cpp | 8 +- src/Client/Suggest.cpp | 3 +- src/Core/ProtocolDefines.h | 4 +- src/Interpreters/Context.cpp | 5 ++ src/Interpreters/Context.h | 5 ++ src/Interpreters/InterpreterSetQuery.cpp | 6 +- src/Parsers/ASTSetQuery.h | 5 +- src/Parsers/ParserSetQuery.cpp | 43 ++++++++-- src/Parsers/ParserSetQuery.h | 2 + src/QueryPipeline/RemoteInserter.cpp | 3 +- src/Server/HTTPHandler.cpp | 5 +- src/Server/TCPHandler.cpp | 27 +++++++ ...0955_complex_prepared_statements.reference | 1 + ...d_protocol_with_query_parameters.reference | 9 +++ ...7_extend_protocol_with_query_parameters.sh | 80 +++++++++++++++++++ 23 files changed, 213 insertions(+), 21 deletions(-) create mode 100644 tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference create mode 100755 tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 584806951cf..9b1dbbe221a 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -133,6 +133,7 @@ std::vector Client::loadWarningMessages() std::vector messages; connection->sendQuery(connection_parameters.timeouts, "SELECT * FROM viewIfPermitted(SELECT message FROM system.warnings ELSE null('message String'))", + {} /* query_parameters */, "" /* query_id */, QueryProcessingStage::Complete, &global_context->getSettingsRef(), diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 977d2bca01f..468c49f22b7 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -740,8 +740,10 @@ void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr pa /// Rewrite query only when we have query parameters. /// Note that if query is rewritten, comments in query are lost. /// But the user often wants to see comments in server logs, query log, processlist, etc. + /// For recent versions of the server query parameters will be transferred by network and applied on the server side. auto query = query_to_execute; - if (!query_parameters.empty()) + if (!query_parameters.empty() + && connection->getServerRevision(connection_parameters.timeouts) < DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) { /// Replace ASTQueryParameter with ASTLiteral for prepared statements. 
ReplaceQueryParameterVisitor visitor(query_parameters); @@ -762,6 +764,7 @@ void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr pa connection->sendQuery( connection_parameters.timeouts, query, + query_parameters, global_context->getCurrentQueryId(), query_processing_stage, &global_context->getSettingsRef(), @@ -1087,7 +1090,8 @@ bool ClientBase::receiveSampleBlock(Block & out, ColumnsDescription & columns_de void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr parsed_query) { auto query = query_to_execute; - if (!query_parameters.empty()) + if (!query_parameters.empty() + && connection->getServerRevision(connection_parameters.timeouts) < DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) { /// Replace ASTQueryParameter with ASTLiteral for prepared statements. ReplaceQueryParameterVisitor visitor(query_parameters); @@ -1114,6 +1118,7 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars connection->sendQuery( connection_parameters.timeouts, query, + query_parameters, global_context->getCurrentQueryId(), query_processing_stage, &global_context->getSettingsRef(), diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index bbd4c380831..cd2b24a7c76 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -477,6 +477,7 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time void Connection::sendQuery( const ConnectionTimeouts & timeouts, const String & query, + const NameToNameMap & query_parameters, const String & query_id_, UInt64 stage, const Settings * settings, @@ -569,6 +570,14 @@ void Connection::sendQuery( writeStringBinary(query, *out); + if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) + { + Settings params; + for (const auto & [name, value] : query_parameters) + params.set(name, value); + params.write(*out, SettingsWriteFormat::STRINGS_WITH_FLAGS); + } + maybe_compressed_in.reset(); maybe_compressed_out.reset(); block_in.reset(); diff --git a/src/Client/Connection.h b/src/Client/Connection.h index c712fd730dd..8d839c62754 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -97,6 +97,7 @@ public: void sendQuery( const ConnectionTimeouts & timeouts, const String & query, + const NameToNameMap& query_parameters, const String & query_id_/* = "" */, UInt64 stage/* = QueryProcessingStage::Complete */, const Settings * settings/* = nullptr */, diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 9f0ead79981..f1802467b57 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -183,7 +183,7 @@ void HedgedConnections::sendQuery( modified_settings.parallel_replica_offset = fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset; } - replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data, {}); + replica.connection->sendQuery(timeouts, query, /* query_parameters */ {}, query_id, stage, &modified_settings, &client_info, with_pending_data, {}); replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout); replica.packet_receiver->setReceiveTimeout(hedged_connections_factory.getConnectionTimeouts().receive_timeout); }; diff --git a/src/Client/IServerConnection.h b/src/Client/IServerConnection.h index 542aecb9849..96cf1f119ba 100644 --- a/src/Client/IServerConnection.h +++ b/src/Client/IServerConnection.h @@ -86,6 +86,7 @@ public: virtual void sendQuery( const ConnectionTimeouts & 
timeouts, const String & query, + const NameToNameMap & query_parameters, const String & query_id_, UInt64 stage, const Settings * settings, diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 425e54fb392..b10e24f1ae4 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -75,6 +75,7 @@ void LocalConnection::sendProfileEvents() void LocalConnection::sendQuery( const ConnectionTimeouts &, const String & query, + const NameToNameMap & query_parameters, const String & query_id, UInt64 stage, const Settings *, @@ -82,6 +83,9 @@ void LocalConnection::sendQuery( bool, std::function process_progress_callback) { + if (!query_parameters.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "clickhouse local does not support query parameters"); + /// Suggestion comes without client_info. if (client_info) query_context = session.makeQueryContext(*client_info); diff --git a/src/Client/LocalConnection.h b/src/Client/LocalConnection.h index 1ebe4a1d901..dbdd3c127cb 100644 --- a/src/Client/LocalConnection.h +++ b/src/Client/LocalConnection.h @@ -94,6 +94,7 @@ public: void sendQuery( const ConnectionTimeouts & timeouts, const String & query, + const NameToNameMap & query_parameters, const String & query_id/* = "" */, UInt64 stage/* = QueryProcessingStage::Complete */, const Settings * settings/* = nullptr */, diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index b14ff9f2c8d..72cd4c46477 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -160,15 +160,15 @@ void MultiplexedConnections::sendQuery( if (enable_sample_offset_parallel_processing) modified_settings.parallel_replica_offset = i; - replica_states[i].connection->sendQuery(timeouts, query, query_id, - stage, &modified_settings, &client_info, with_pending_data, {}); + replica_states[i].connection->sendQuery( + timeouts, query, /* query_parameters */ {}, query_id, stage, &modified_settings, &client_info, with_pending_data, {}); } } else { /// Use single replica. - replica_states[0].connection->sendQuery(timeouts, query, query_id, - stage, &modified_settings, &client_info, with_pending_data, {}); + replica_states[0].connection->sendQuery( + timeouts, query, /* query_parameters */ {}, query_id, stage, &modified_settings, &client_info, with_pending_data, {}); } sent_query = true; diff --git a/src/Client/Suggest.cpp b/src/Client/Suggest.cpp index 1074adb2bd4..f8d41853566 100644 --- a/src/Client/Suggest.cpp +++ b/src/Client/Suggest.cpp @@ -138,7 +138,8 @@ void Suggest::load(ContextPtr context, const ConnectionParameters & connection_p void Suggest::fetch(IServerConnection & connection, const ConnectionTimeouts & timeouts, const std::string & query) { - connection.sendQuery(timeouts, query, "" /* query_id */, QueryProcessingStage::Complete, nullptr, nullptr, false, {}); + connection.sendQuery( + timeouts, query, {} /* query_parameters */, "" /* query_id */, QueryProcessingStage::Complete, nullptr, nullptr, false, {}); while (true) { diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h index cf0a9d8b887..78585492c8e 100644 --- a/src/Core/ProtocolDefines.h +++ b/src/Core/ProtocolDefines.h @@ -52,7 +52,7 @@ /// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing common with VERSION_REVISION, /// later is just a number for server version (one number instead of commit SHA) /// for simplicity (sometimes it may be more convenient in some use cases). 
-#define DBMS_TCP_PROTOCOL_VERSION 54458 +#define DBMS_TCP_PROTOCOL_VERSION 54459 #define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449 @@ -63,3 +63,5 @@ #define DBMS_MIN_PROTOCOL_VERSION_WITH_ADDENDUM 54458 #define DBMS_MIN_PROTOCOL_VERSION_WITH_QUOTA_KEY 54458 + +#define DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS 54459 diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index a5629b33d22..f03306b5426 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2965,6 +2965,11 @@ void Context::setQueryParameter(const String & name, const String & value) throw Exception("Duplicate name " + backQuote(name) + " of query parameter", ErrorCodes::BAD_ARGUMENTS); } +void Context::addQueryParameters(const NameToNameMap & parameters) +{ + for (const auto & [name, value] : parameters) + query_parameters.insert_or_assign(name, value); +} void Context::addBridgeCommand(std::unique_ptr cmd) const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index cf508c7bfdb..9afbae46ce1 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -946,9 +946,14 @@ public: /// Query parameters for prepared statements. bool hasQueryParameters() const; const NameToNameMap & getQueryParameters() const; + + /// Throws if parameter with the given name already set. void setQueryParameter(const String & name, const String & value); void setQueryParameters(const NameToNameMap & parameters) { query_parameters = parameters; } + /// Overrides values of existing parameters. + void addQueryParameters(const NameToNameMap & parameters); + /// Add started bridge command. It will be killed after context destruction void addBridgeCommand(std::unique_ptr cmd) const; diff --git a/src/Interpreters/InterpreterSetQuery.cpp b/src/Interpreters/InterpreterSetQuery.cpp index 1c6a4236bf6..2bd8d648040 100644 --- a/src/Interpreters/InterpreterSetQuery.cpp +++ b/src/Interpreters/InterpreterSetQuery.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include namespace DB { @@ -10,7 +10,9 @@ BlockIO InterpreterSetQuery::execute() { const auto & ast = query_ptr->as(); getContext()->checkSettingsConstraints(ast.changes); - getContext()->getSessionContext()->applySettingsChanges(ast.changes); + auto session_context = getContext()->getSessionContext(); + session_context->applySettingsChanges(ast.changes); + session_context->addQueryParameters(ast.query_parameters); return {}; } diff --git a/src/Parsers/ASTSetQuery.h b/src/Parsers/ASTSetQuery.h index 40a0b679650..4e3d9d227b6 100644 --- a/src/Parsers/ASTSetQuery.h +++ b/src/Parsers/ASTSetQuery.h @@ -1,8 +1,8 @@ #pragma once -#include +#include #include - +#include namespace DB { @@ -15,6 +15,7 @@ public: bool is_standalone = true; /// If false, this AST is a part of another query, such as SELECT. SettingsChanges changes; + NameToNameMap query_parameters; /** Get the text that identifies this element. 
*/ String getID(char) const override { return "Set"; } diff --git a/src/Parsers/ParserSetQuery.cpp b/src/Parsers/ParserSetQuery.cpp index 0ff437bcfb1..20de785ac1b 100644 --- a/src/Parsers/ParserSetQuery.cpp +++ b/src/Parsers/ParserSetQuery.cpp @@ -5,13 +5,38 @@ #include #include -#include +#include +#include +#include +#include #include - +#include namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +static NameToNameMap::value_type convertToQueryParameter(SettingChange change) +{ + auto name = change.name.substr(strlen(QUERY_PARAMETER_NAME_PREFIX)); + if (name.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter name cannot be empty"); + + auto value = applyVisitor(FieldVisitorToString(), change.value); + /// writeQuoted is not always quoted in line with SQL standard https://github.com/ClickHouse/ClickHouse/blob/master/src/IO/WriteHelpers.h + if (value.starts_with('\'')) + { + ReadBufferFromOwnString buf(value); + readQuoted(value, buf); + } + return {name, value}; +} + + class ParserLiteralOrMap : public IParserBase { public: @@ -111,16 +136,23 @@ bool ParserSetQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } SettingsChanges changes; + NameToNameMap query_parameters; while (true) { - if (!changes.empty() && !s_comma.ignore(pos)) + if ((!changes.empty() || !query_parameters.empty()) && !s_comma.ignore(pos)) break; - changes.push_back(SettingChange{}); + /// Either a setting or a parameter for prepared statement (if name starts with QUERY_PARAMETER_NAME_PREFIX) + SettingChange current; - if (!parseNameValuePair(changes.back(), pos, expected)) + if (!parseNameValuePair(current, pos, expected)) return false; + + if (current.name.starts_with(QUERY_PARAMETER_NAME_PREFIX)) + query_parameters.emplace(convertToQueryParameter(std::move(current))); + else + changes.push_back(std::move(current)); } auto query = std::make_shared(); @@ -128,6 +160,7 @@ bool ParserSetQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) query->is_standalone = !parse_only_internals; query->changes = std::move(changes); + query->query_parameters = std::move(query_parameters); return true; } diff --git a/src/Parsers/ParserSetQuery.h b/src/Parsers/ParserSetQuery.h index 0bc1cec3093..d9c69358ac2 100644 --- a/src/Parsers/ParserSetQuery.h +++ b/src/Parsers/ParserSetQuery.h @@ -9,6 +9,8 @@ namespace DB struct SettingChange; +constexpr char QUERY_PARAMETER_NAME_PREFIX[] = "param_"; + /** Query like this: * SET name1 = value1, name2 = value2, ... */ diff --git a/src/QueryPipeline/RemoteInserter.cpp b/src/QueryPipeline/RemoteInserter.cpp index 58fed6e5466..cd0485ec8e3 100644 --- a/src/QueryPipeline/RemoteInserter.cpp +++ b/src/QueryPipeline/RemoteInserter.cpp @@ -67,7 +67,8 @@ RemoteInserter::RemoteInserter( /** Send query and receive "header", that describes table structure. * Header is needed to know, what structure is required for blocks to be passed to 'write' method. 
*/ - connection.sendQuery(timeouts, query, "", QueryProcessingStage::Complete, &settings, &modified_client_info, false, {}); + connection.sendQuery( + timeouts, query, /* query_parameters */ {}, "", QueryProcessingStage::Complete, &settings, &modified_client_info, false, {}); while (true) { diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index cdf856e87d5..5b8e17eb279 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -1014,10 +1015,10 @@ bool DynamicQueryHandler::customizeQueryParam(ContextMutablePtr context, const s if (key == param_name) return true; /// do nothing - if (startsWith(key, "param_")) + if (startsWith(key, QUERY_PARAMETER_NAME_PREFIX)) { /// Save name and values of substitution in dictionary. - const String parameter_name = key.substr(strlen("param_")); + const String parameter_name = key.substr(strlen(QUERY_PARAMETER_NAME_PREFIX)); if (!context->getQueryParameters().contains(parameter_name)) context->setQueryParameter(parameter_name, value); diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 2f16148e0a2..4c6eb1a253b 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -57,6 +57,7 @@ #include using namespace std::literals; +using namespace DB; namespace CurrentMetrics @@ -64,6 +65,23 @@ namespace CurrentMetrics extern const Metric QueryThread; } +namespace +{ +NameToNameMap convertToQueryParameters(const Settings & passed_params) +{ + NameToNameMap query_parameters; + for (const auto & param : passed_params) + { + std::string value; + ReadBufferFromOwnString buf(param.getValueString()); + readQuoted(value, buf); + query_parameters.emplace(param.getName(), value); + } + return query_parameters; +} + +} + namespace DB { @@ -1334,6 +1352,10 @@ void TCPHandler::receiveQuery() readStringBinary(state.query, *in); + Settings passed_params; + if (client_tcp_protocol_version >= DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) + passed_params.read(*in, settings_format); + /// TODO Unify interserver authentication (and make sure that it's secure enough) if (is_interserver_mode) { @@ -1424,6 +1446,8 @@ void TCPHandler::receiveQuery() /// so we have to apply the changes first. query_context->setCurrentQueryId(state.query_id); + query_context->addQueryParameters(convertToQueryParameters(passed_params)); + /// For testing hedged requests if (unlikely(sleep_after_receiving_query.totalMilliseconds())) { @@ -1460,6 +1484,9 @@ void TCPHandler::receiveUnexpectedQuery() readStringBinary(skip_string, *in); + if (client_tcp_protocol_version >= DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) + skip_settings.read(*in, settings_format); + throw NetException("Unexpected packet Query received from client", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); } diff --git a/tests/queries/0_stateless/00955_complex_prepared_statements.reference b/tests/queries/0_stateless/00955_complex_prepared_statements.reference index 701cc5f8781..257526768a0 100644 --- a/tests/queries/0_stateless/00955_complex_prepared_statements.reference +++ b/tests/queries/0_stateless/00955_complex_prepared_statements.reference @@ -4,3 +4,4 @@ [[10],[10],[10]] [10,10,10] [[10],[10],[10]] (10,'Test') (10,('dt',10)) 2015-02-15 Code: 457. +Code: 457. 
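Taken together, these changes keep older servers working: ClientBase still substitutes the {name:Type} placeholders locally with ReplaceQueryParameterVisitor whenever the negotiated revision is below DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS (54459), while newer servers receive the parameters over the wire and apply them in TCPHandler::receiveQuery through Context::addQueryParameters. SET param_name = value is parsed by the same ParserSetQuery as ordinary settings, but names carrying the param_ prefix are routed into the session's query parameters instead of its settings, which is what the new tests below exercise.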
diff --git a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference new file mode 100644 index 00000000000..f46cdb6e5e3 --- /dev/null +++ b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference @@ -0,0 +1,9 @@ +42 hello 2022-08-04 18:30:53 {'2b95a497-3a5d-49af-bf85-15763318cde7':[1.2,3.4]} +UInt64 String DateTime Map(UUID, Array(Float32)) +42 [1,2,3] {'abc':22,'def':33} [[4,5,6],[7],[8,9]] {10:[11,12],13:[14,15]} {'ghj':{'klm':[16,17]},'nop':{'rst':[18]}} +5 +42 +13 +13 str 2022-08-04 18:30:53 {'10':[11,12],'13':[14,15]} +1 +1 diff --git a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh new file mode 100755 index 00000000000..335af1bb6e6 --- /dev/null +++ b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# shellcheck disable=SC2154 + +unset CLICKHOUSE_LOG_COMMENT + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT \ + --param_num="42" \ + --param_str="hello" \ + --param_date="2022-08-04 18:30:53" \ + --param_map="{'2b95a497-3a5d-49af-bf85-15763318cde7': [1.2, 3.4]}" \ + -q "select {num:UInt64}, {str:String}, {date:DateTime}, {map:Map(UUID, Array(Float32))}" + + +$CLICKHOUSE_CLIENT \ + --param_num="42" \ + --param_str="hello" \ + --param_date="2022-08-04 18:30:53" \ + --param_map="{'2b95a497-3a5d-49af-bf85-15763318cde7': [1.2, 3.4]}" \ + -q "select toTypeName({num:UInt64}), toTypeName({str:String}), toTypeName({date:DateTime}), toTypeName({map:Map(UUID, Array(Float32))})" + + +table_name="t_02377_extend_protocol_with_query_parameters_$RANDOM$RANDOM" +$CLICKHOUSE_CLIENT -n -q " + create table $table_name( + id Int64, + arr Array(UInt8), + map Map(String, UInt8), + mul_arr Array(Array(UInt8)), + map_arr Map(UInt8, Array(UInt8)), + map_map_arr Map(String, Map(String, Array(UInt8)))) + engine = MergeTree + order by (id)" + + +$CLICKHOUSE_CLIENT \ + --param_id="42" \ + --param_arr="[1, 2, 3]" \ + --param_map="{'abc': 22, 'def': 33}" \ + --param_mul_arr="[[4, 5, 6], [7], [8, 9]]" \ + --param_map_arr="{10: [11, 12], 13: [14, 15]}" \ + --param_map_map_arr="{'ghj': {'klm': [16, 17]}, 'nop': {'rst': [18]}}" \ + -q "insert into $table_name values({id: Int64}, {arr: Array(UInt8)}, {map: Map(String, UInt8)}, {mul_arr: Array(Array(UInt8))}, {map_arr: Map(UInt8, Array(UInt8))}, {map_map_arr: Map(String, Map(String, Array(UInt8)))})" + + +$CLICKHOUSE_CLIENT -q "select * from $table_name" + + +$CLICKHOUSE_CLIENT \ + --param_tbl="numbers" \ + --param_db="system" \ + --param_col="number" \ + -q "select {col:Identifier} from {db:Identifier}.{tbl:Identifier} limit 1 offset 5" + + +# it is possible to set parameter for the current session +$CLICKHOUSE_CLIENT -n -q "set param_n = 42; select {n: UInt8}" +# and it will not be visible to other sessions +$CLICKHOUSE_CLIENT -n -q "select {n: UInt8} -- { serverError 456 }" + + +# the same parameter could be set multiple times within one session (new value overrides the previous one) +$CLICKHOUSE_CLIENT -n -q "set param_n = 12; set param_n = 13; select {n: UInt8}" + + +# but multiple different parameters could be defined within each session +$CLICKHOUSE_CLIENT -n -q " + set param_a = 13, param_b = 'str'; + set param_c = '2022-08-04 18:30:53'; + set param_d = '{\'10\': [11, 12], 
\'13\': [14, 15]}'; + select {a: UInt32}, {b: String}, {c: DateTime}, {d: Map(String, Array(UInt8))}" + +# empty parameter name is not allowed +$CLICKHOUSE_CLIENT --param_="" -q "select 1" 2>&1 | grep -c 'Code: 36' +$CLICKHOUSE_CLIENT -q "set param_ = ''" 2>&1 | grep -c 'Code: 36'
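To illustrate the extended native-protocol API from the C++ side, a minimal, hypothetical sketch of a caller of Connection::sendQuery after this change follows. It is not taken from the patch: the connection, timeouts, settings and client_info objects, as well as the parameter names id and phrase, are assumptions supplied by the caller; the parameter map is only transmitted when the server reports revision 54459 (DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS) or newer, as the Connection::sendQuery change above shows.

    /// Hypothetical caller, mirroring how ClientBase::processOrdinaryQuery uses the new argument.
    NameToNameMap query_parameters{{"id", "42"}, {"phrase", "hello"}};
    connection.sendQuery(
        timeouts,
        "SELECT {id:UInt64}, {phrase:String}",
        query_parameters,               /// new argument; the server applies it via Context::addQueryParameters
        "" /* query_id */,
        QueryProcessingStage::Complete,
        &settings,
        &client_info,
        false /* with_pending_data */,
        {} /* process_progress_callback */);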