From bdd8bcc0d9b68474ca10df52772babc5aa1a20d4 Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Fri, 7 Jun 2024 15:51:13 +0000 Subject: [PATCH 01/70] add some log --- src/Databases/DatabaseAtomic.cpp | 7 +++++++ src/Storages/StorageReplicatedMergeTree.cpp | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index ccab72cfbae..d431eb5c1b7 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -112,6 +112,13 @@ StoragePtr DatabaseAtomic::detachTable(ContextPtr /* context */, const String & table_name_to_path.erase(name); detached_tables.emplace(table->getStorageID().uuid, table); not_in_use = cleanupDetachedTables(); + + if (!not_in_use.empty()) + { + not_in_use.clear(); + LOG_DEBUG(log, "Finish removing non using detached tables"); + } + return table; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e18e66d7af9..68bb5916d7c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5,20 +5,21 @@ #include #include +#include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include -#include -#include -#include #include @@ -5272,6 +5273,8 @@ void StorageReplicatedMergeTree::flushAndPrepareForShutdown() if (shutdown_prepared_called.exchange(true)) return; + LOG_TRACE(log, "Start preparing for shutdown"); + try { auto settings_ptr = getSettings(); @@ -5282,7 +5285,11 @@ void StorageReplicatedMergeTree::flushAndPrepareForShutdown() stopBeingLeader(); if (attach_thread) + { attach_thread->shutdown(); + LOG_TRACE(log, "Attach thread shutdowned"); + } + restarting_thread.shutdown(/* part_of_full_shutdown */true); /// Explicitly set the event, because the restarting thread will not set it again @@ -5295,6 +5302,8 @@ void 
StorageReplicatedMergeTree::flushAndPrepareForShutdown() shutdown_deadline.emplace(std::chrono::system_clock::now()); throw; } + + LOG_TRACE(log, "Finish preparing for shutdown"); } void StorageReplicatedMergeTree::partialShutdown() @@ -5332,6 +5341,9 @@ void StorageReplicatedMergeTree::shutdown(bool) if (shutdown_called.exchange(true)) return; + const auto storage_name = getStorageID().getNameForLogs(); + LOG_TRACE(log, "Shutdown started, table={}", storage_name); + flushAndPrepareForShutdown(); if (!shutdown_deadline.has_value()) @@ -5374,6 +5386,7 @@ void StorageReplicatedMergeTree::shutdown(bool) /// Wait for all of them std::lock_guard lock(data_parts_exchange_ptr->rwlock); } + LOG_TRACE(log, "Shutdown finished, table={}", storage_name); } From 8af89e6e6d919cf4f0c1eb4a5372ab49dfd9b144 Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Thu, 13 Jun 2024 13:22:25 +0000 Subject: [PATCH 02/70] apply comments --- src/Databases/DatabaseAtomic.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index d431eb5c1b7..b30b05bb7a7 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -116,7 +116,7 @@ StoragePtr DatabaseAtomic::detachTable(ContextPtr /* context */, const String & if (!not_in_use.empty()) { not_in_use.clear(); - LOG_DEBUG(log, "Finish removing non using detached tables"); + LOG_DEBUG(log, "Finished removing not used detached tables"); } return table; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 68bb5916d7c..9b914e3de8f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5,17 +5,15 @@ #include #include -#include -#include #include #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -5287,7 
+5285,7 @@ void StorageReplicatedMergeTree::flushAndPrepareForShutdown() if (attach_thread) { attach_thread->shutdown(); - LOG_TRACE(log, "Attach thread shutdowned"); + LOG_TRACE(log, "The attach thread is shutdown"); } @@ -5303,7 +5301,7 @@ void StorageReplicatedMergeTree::flushAndPrepareForShutdown() throw; } - LOG_TRACE(log, "Finish preparing for shutdown"); + LOG_TRACE(log, "Finished preparing for shutdown"); } void StorageReplicatedMergeTree::partialShutdown() @@ -5342,7 +5340,7 @@ void StorageReplicatedMergeTree::shutdown(bool) return; const auto storage_name = getStorageID().getNameForLogs(); - LOG_TRACE(log, "Shutdown started, table={}", storage_name); + LOG_TRACE(log, "Shutdown started"); flushAndPrepareForShutdown(); @@ -5386,7 +5384,7 @@ void StorageReplicatedMergeTree::shutdown(bool) /// Wait for all of them std::lock_guard lock(data_parts_exchange_ptr->rwlock); } - LOG_TRACE(log, "Shutdown finished, table={}", storage_name); + LOG_TRACE(log, "Shutdown finished"); } From f7eac01b822c94184a16dfda1685d95f05c5cc8a Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Thu, 13 Jun 2024 13:31:52 +0000 Subject: [PATCH 03/70] up includes --- src/Storages/StorageReplicatedMergeTree.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b33514907f9..a1f4a40a0ab 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5,10 +5,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include From 96fd928bced6c14e9de98ad14b77b370ed14de8e Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Wed, 19 Jun 2024 08:59:48 +0000 Subject: [PATCH 04/70] remove unused var --- src/Storages/StorageReplicatedMergeTree.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 
a1f4a40a0ab..61a492c1f63 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5341,7 +5341,6 @@ void StorageReplicatedMergeTree::shutdown(bool) if (shutdown_called.exchange(true)) return; - const auto storage_name = getStorageID().getNameForLogs(); LOG_TRACE(log, "Shutdown started"); flushAndPrepareForShutdown(); From aa7017a7fb1dcc09f6d7f948d3adb2d65a7b5201 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 1 Jul 2024 00:32:39 +0200 Subject: [PATCH 05/70] Add a test for #43003 --- .../03199_join_with_materialized_column.reference | 0 .../0_stateless/03199_join_with_materialized_column.sql | 6 ++++++ 2 files changed, 6 insertions(+) create mode 100644 tests/queries/0_stateless/03199_join_with_materialized_column.reference create mode 100644 tests/queries/0_stateless/03199_join_with_materialized_column.sql diff --git a/tests/queries/0_stateless/03199_join_with_materialized_column.reference b/tests/queries/0_stateless/03199_join_with_materialized_column.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03199_join_with_materialized_column.sql b/tests/queries/0_stateless/03199_join_with_materialized_column.sql new file mode 100644 index 00000000000..8c53c5b3e66 --- /dev/null +++ b/tests/queries/0_stateless/03199_join_with_materialized_column.sql @@ -0,0 +1,6 @@ +SET allow_experimental_analyzer = 1; + +DROP TABLE IF EXISTS table_with_materialized; +CREATE TABLE table_with_materialized (col String MATERIALIZED 'A') ENGINE = Memory; +SELECT number FROM numbers(1) AS n, table_with_materialized; +DROP TABLE table_with_materialized; From ea3b0e735de285db89cb36e2782db88c6d403ee2 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 3 Jul 2024 13:40:41 +0000 Subject: [PATCH 06/70] Refactor JSONExtract functions and support more types and reuse its code in new JSON type --- docs/en/sql-reference/data-types/dynamic.md | 34 + src/Common/JSONParsers/SimdJSONParser.h | 1 
+ src/DataTypes/DataTypeDynamic.cpp | 1 + src/Formats/JSONExtractTree.cpp | 1561 +++++++++ src/Formats/JSONExtractTree.h | 35 + src/Formats/SchemaInferenceUtils.cpp | 94 +- src/Formats/SchemaInferenceUtils.h | 10 + src/Functions/FunctionsJSON.cpp | 1061 +++++- src/Functions/FunctionsJSON.h | 3054 +++++++---------- .../03198_json_extract_more_types.reference | 21 + .../03198_json_extract_more_types.sql | 29 + .../03199_json_extract_dynamic.reference | 30 + .../03199_json_extract_dynamic.sql | 37 + 13 files changed, 4158 insertions(+), 1810 deletions(-) create mode 100644 src/Formats/JSONExtractTree.cpp create mode 100644 src/Formats/JSONExtractTree.h create mode 100644 tests/queries/0_stateless/03198_json_extract_more_types.reference create mode 100644 tests/queries/0_stateless/03198_json_extract_more_types.sql create mode 100644 tests/queries/0_stateless/03199_json_extract_dynamic.reference create mode 100644 tests/queries/0_stateless/03199_json_extract_dynamic.sql diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index 955fd54e641..e063bed2de4 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -493,3 +493,37 @@ SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) O ``` As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and casted all other types to `String`. 
+ +## JSONExtract functions with Dynamic + +All `JSONExtract*` functions support `Dynamic` type: + +```sql +SELECT JSONExtract('{"a" : [1, 2, 3]}', 'a', 'Dynamic') AS dynamic, dynamicType(dynamic) AS dynamic_type; +``` + +```text +┌─dynamic─┬─dynamic_type───────────┐ +│ [1,2,3] │ Array(Nullable(Int64)) │ +└─────────┴────────────────────────┘ +``` + +```sql +SELECT JSONExtract('{"obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}', 'obj', 'Map(String, Variant(UInt32, String, Array(UInt32)))') AS map_of_dynamics, mapApply((k, v) -> (k, variantType(v)), map_of_dynamics) AS map_of_dynamic_types +``` + +```text +┌─map_of_dynamics──────────────────┬─map_of_dynamic_types────────────────────────────┐ +│ {'a':42,'b':'Hello','c':[1,2,3]} │ {'a':'UInt32','b':'String','c':'Array(UInt32)'} │ +└──────────────────────────────────┴─────────────────────────────────────────────────┘ +``` + +```sql +SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Variant(UInt32, String, Array(UInt32))') AS dynamics, arrayMap(x -> (x.1, variantType(x.2)), dynamics) AS dynamic_types +``` + +```text +┌─dynamics───────────────────────────────┬─dynamic_types─────────────────────────────────────────┐ +│ [('a',42),('b','Hello'),('c',[1,2,3])] │ [('a','UInt32'),('b','String'),('c','Array(UInt32)')] │ +└────────────────────────────────────────┴───────────────────────────────────────────────────────┘ +``` diff --git a/src/Common/JSONParsers/SimdJSONParser.h b/src/Common/JSONParsers/SimdJSONParser.h index 827d142266a..db679b14f52 100644 --- a/src/Common/JSONParsers/SimdJSONParser.h +++ b/src/Common/JSONParsers/SimdJSONParser.h @@ -14,6 +14,7 @@ namespace DB { + namespace ErrorCodes { extern const int CANNOT_ALLOCATE_MEMORY; diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp index c920e69c13b..6826c46a1a7 100644 --- a/src/DataTypes/DataTypeDynamic.cpp +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -12,6 +12,7 @@ #include #include #include +#include
namespace DB { diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp new file mode 100644 index 00000000000..6d019f96ba6 --- /dev/null +++ b/src/Formats/JSONExtractTree.cpp @@ -0,0 +1,1561 @@ +#include +#include + +#include +#if USE_SIMDJSON +#include +#endif +#if USE_RAPIDJSON +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include + +namespace DB +{ + +template +void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings) +{ + if (element.isInt64()) + { + writeIntText(element.getInt64(), buf); + return; + } + if (element.isUInt64()) + { + writeIntText(element.getUInt64(), buf); + return; + } + if (element.isDouble()) + { + writeFloatText(element.getDouble(), buf); + return; + } + if (element.isBool()) + { + if (element.getBool()) + writeCString("true", buf); + else + writeCString("false", buf); + return; + } + if (element.isString()) + { + writeJSONString(element.getString(), buf, format_settings); + return; + } + if (element.isArray()) + { + writeChar('[', buf); + bool need_comma = false; + for (auto value : element.getArray()) + { + if (std::exchange(need_comma, true)) + writeChar(',', buf); + jsonElementToString(value, buf, format_settings); + } + writeChar(']', buf); + return; + } + if (element.isObject()) + { + writeChar('{', buf); + bool need_comma = false; + for (auto [key, value] : element.getObject()) + { + if (std::exchange(need_comma, true)) + writeChar(',', buf); + writeJSONString(key, buf, format_settings); + writeChar(':', buf); + jsonElementToString(value, buf, format_settings); + } + writeChar('}', buf); + return; + } + if (element.isNull()) + { + 
writeCString("null", buf); + return; + } +} + +template +bool tryGetNumericValueFromJSONElement( + NumberType & value, + const typename JSONParser::Element & element, + bool convert_bool_to_integer, + String & error) +{ + switch (element.type()) + { + case ElementType::DOUBLE: + if constexpr (std::is_floating_point_v) + { + /// We permit inaccurate conversion of double to float. + /// Example: double 0.1 from JSON is not representable in float. + /// But it will be more convenient for user to perform conversion. + value = static_cast(element.getDouble()); + } + else if (!accurate::convertNumeric(element.getDouble(), value)) + { + error = fmt::format("cannot convert double value {} to {}", element.getDouble(), TypeName); + return false; + } + break; + case ElementType::UINT64: + if (!accurate::convertNumeric(element.getUInt64(), value)) + { + error = fmt::format("cannot convert UInt64 value {} to {}", element.getUInt64(), TypeName); + return false; + } + break; + case ElementType::INT64: + if (!accurate::convertNumeric(element.getInt64(), value)) + { + error = fmt::format("cannot convert Int64 value {} to {}", element.getInt64(), TypeName); + return false; + } + break; + case ElementType::BOOL: + if constexpr (is_integer) + { + if (convert_bool_to_integer) + { + value = static_cast(element.getBool()); + break; + } + } + error = fmt::format("cannot convert bool value to {}", TypeName); + return false; + case ElementType::STRING: { + auto rb = ReadBufferFromMemory{element.getString()}; + if constexpr (std::is_floating_point_v) + { + if (!tryReadFloatText(value, rb) || !rb.eof()) + { + error = fmt::format("cannot parse {} value here: {}", TypeName, element.getString()); + return false; + } + } + else + { + if (tryReadIntText(value, rb) && rb.eof()) + break; + + /// Try to parse float and convert it to integer. 
+ Float64 tmp_float; + rb.position() = rb.buffer().begin(); + if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) + { + error = fmt::format("cannot parse {} value here: {}", TypeName, element.getString()); + return false; + } + + if (!accurate::convertNumeric(tmp_float, value)) + { + error = fmt::format("cannot parse {} value here: {}", TypeName, element.getString()); + return false; + } + } + break; + } + default: + return false; + } + + return true; +} + +namespace +{ + +template +String jsonElementToString(const typename JSONParser::Element & element, const FormatSettings & format_settings) +{ + WriteBufferFromOwnString buf; + jsonElementToString(element, buf, format_settings); + return buf.str(); +} + +template +class NumericNode : public JSONExtractTreeNode +{ +public: + explicit NumericNode(bool is_bool_type_ = false) : is_bool_type(is_bool_type_) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + error = fmt::format("cannot parse {} value from null", TypeName); + return false; + } + + NumberType value; + if (!tryGetNumericValueFromJSONElement(value, element, insert_settings.convert_bool_to_integer || is_bool_type, error)) + { + if (error.empty()) + error = fmt::format("cannot read {} value from JSON element: {}", TypeName, jsonElementToString(element, format_settings)); + return false; + } + + if (is_bool_type) + value = static_cast(value); + + auto & col_vec = assert_cast &>(column); + col_vec.insertValue(value); + return true; + } + +protected: + bool is_bool_type; +}; + +template +class LowCardinalityNumericNode : public NumericNode +{ +public: + explicit LowCardinalityNumericNode(bool is_nullable_, bool is_bool_type_ = false) + : NumericNode(is_bool_type_), 
is_nullable(is_nullable_) + { + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (is_nullable || format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + error = fmt::format("cannot parse {} value from null", TypeName); + return false; + } + + NumberType value; + if (!tryGetNumericValueFromJSONElement(value, element, insert_settings.convert_bool_to_integer || this->is_bool_type, error)) + { + if (error.empty()) + error = fmt::format("cannot read {} value from JSON element: {}", TypeName, jsonElementToString(element, format_settings)); + return false; + } + + if (this->is_bool_type) + value = static_cast(value); + + auto & col_lc = assert_cast(column); + col_lc.insertData(reinterpret_cast(&value), sizeof(value)); + return true; + } + +private: + bool is_nullable; +}; + +template +class StringNode : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + error = "cannot parse String value from null"; + return false; + } + + if (!element.isString()) + { + auto & col_str = assert_cast(column); + auto & chars = col_str.getChars(); + WriteBufferFromVector buf(chars, AppendModeTag()); + jsonElementToString(element, buf, format_settings); + buf.finalize(); + chars.push_back(0); + col_str.getOffsets().push_back(chars.size()); + } + else + { + auto value = element.getString(); + auto & col_str = assert_cast(column); + col_str.insertData(value.data(), value.size()); + } + return true; + } +}; + +template +class LowCardinalityStringNode : 
public JSONExtractTreeNode +{ +public: + explicit LowCardinalityStringNode(bool is_nullable_) : is_nullable(is_nullable_) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (is_nullable || format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + error = "cannot parse String value from null"; + return false; + } + + if (!element.isString()) + { + auto value = jsonElementToString(element, format_settings); + assert_cast(column).insertData(value.data(), value.size()); + } + else + { + auto value = element.getString(); + assert_cast(column).insertData(value.data(), value.size()); + } + + return true; + } + +private: + bool is_nullable; +}; + +template +class FixedStringNode : public JSONExtractTreeNode +{ +public: + explicit FixedStringNode(size_t fixed_length_) : fixed_length(fixed_length_) { } + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + error = "cannot parse FixedString value from null"; + return false; + } + + if (!element.isString()) + return checkValueSizeAndInsert(column, jsonElementToString(element, format_settings), error); + return checkValueSizeAndInsert(column, element.getString(), error); + } + +private: + template + bool checkValueSizeAndInsert(IColumn & column, const T & value, String & error) const + { + if (value.size() > fixed_length) + { + error = fmt::format("too large string for FixedString({}): {}", fixed_length, value); + return false; + } + assert_cast(column).insertData(value.data(), value.size()); + return true; + } + + size_t fixed_length; +}; + 
+template +class LowCardinalityFixedStringNode : public JSONExtractTreeNode +{ +public: + explicit LowCardinalityFixedStringNode(bool is_nullable_, size_t fixed_length_) : is_nullable(is_nullable_), fixed_length(fixed_length_) + { + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (is_nullable || format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + error = "cannot parse FixedString value from null"; + return false; + } + + if (!element.isString()) + return checkValueSizeAndInsert(column, jsonElementToString(element, format_settings), error); + return checkValueSizeAndInsert(column, element.getString(), error); + } + +private: + template + bool checkValueSizeAndInsert(IColumn & column, const T & value, String & error) const + { + if (value.size() > fixed_length) + { + error = fmt::format("too large string for FixedString({}): {}", fixed_length, value); + return false; + } + + // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation. 
+ // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution) + // the data is padded here and written directly to the Low Cardinality Column + if (value.size() == fixed_length) + { + assert_cast(column).insertData(value.data(), value.size()); + } + else + { + String padded_value(value); + padded_value.resize(fixed_length, '\0'); + assert_cast(column).insertData(padded_value.data(), padded_value.size()); + } + return true; + } + + bool is_nullable; + size_t fixed_length; +}; + +template +class UUIDNode : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + if (!element.isString()) + { + error = fmt::format("cannot read UUID value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto data = element.getString(); + UUID uuid; + if (!tryParse(uuid, data)) + { + error = fmt::format("cannot parse UUID value here: {}", data); + return false; + } + + assert_cast(column).insert(uuid); + return true; + } + + + static bool tryParse(UUID & uuid, std::string_view data) + { + ReadBufferFromMemory buf(data.data(), data.size()); + return tryReadUUIDText(uuid, buf) && buf.eof(); + } +}; + +template +class LowCardinalityUUIDNode : public JSONExtractTreeNode +{ +public: + explicit LowCardinalityUUIDNode(bool is_nullable_) : is_nullable(is_nullable_) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && (is_nullable || format_settings.null_as_default)) + { + column.insertDefault(); + return true; + 
} + + if (!element.isString()) + { + error = fmt::format("cannot read UUID value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto data = element.getString(); + ReadBufferFromMemory buf(data.data(), data.size()); + UUID uuid; + if (!tryReadUUIDText(uuid, buf) || !buf.eof()) + { + error = fmt::format("cannot parse UUID value here: {}", data); + return false; + } + assert_cast(column).insertData(reinterpret_cast(&uuid), sizeof(uuid)); + return true; + } + +private: + bool is_nullable; +}; + +template +class DateNode : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + if (!element.isString()) + { + error = fmt::format("cannot read Date value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto data = element.getString(); + ReadBufferFromMemory buf(data.data(), data.size()); + DateType date; + if (!tryReadDateText(date, buf) || !buf.eof()) + { + error = fmt::format("cannot parse Date value here: {}", data); + return false; + } + + assert_cast &>(column).insertValue(date); + return true; + } +}; + +template +class DateTimeNode : public JSONExtractTreeNode, public TimezoneMixin +{ +public: + explicit DateTimeNode(const DataTypeDateTime & datetime_type) : TimezoneMixin(datetime_type) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + time_t value; + if (element.isString()) + { + if (!tryParse(value, 
element.getString(), format_settings.date_time_input_format)) + { + error = fmt::format("cannot parse DateTime value here: {}", element.getString()); + return false; + } + } + else if (element.isUInt64()) + { + value = element.getUInt64(); + } + else + { + error = fmt::format("cannot read DateTime value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + assert_cast(column).insert(value); + return true; + } + + bool tryParse(time_t & value, std::string_view data, FormatSettings::DateTimeInputFormat date_time_input_format) const + { + ReadBufferFromMemory buf(data.data(), data.size()); + switch (date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + if (tryReadDateTimeText(value, buf, time_zone) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + if (tryParseDateTimeBestEffort(value, buf, time_zone, utc_time_zone) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + if (tryParseDateTimeBestEffortUS(value, buf, time_zone, utc_time_zone) && buf.eof()) + return true; + break; + } + + return false; + } +}; + +template +class DecimalNode : public JSONExtractTreeNode +{ +public: + explicit DecimalNode(const DataTypePtr & type) : scale(assert_cast &>(*type).getScale()) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + DecimalType value{}; + + switch (element.type()) + { + case ElementType::DOUBLE: + value = convertToDecimal, DataTypeDecimal>(element.getDouble(), scale); + break; + case ElementType::UINT64: + value = convertToDecimal, DataTypeDecimal>(element.getUInt64(), scale); + break; + case ElementType::INT64: + value = convertToDecimal, DataTypeDecimal>(element.getInt64(), scale); + break; + case ElementType::STRING: { + auto rb = 
ReadBufferFromMemory{element.getString()}; + if (!SerializationDecimal::tryReadText(value, rb, DecimalUtils::max_precision, scale)) + { + error = fmt::format("cannot parse Decimal value here: {}", element.getString()); + return false; + } + break; + } + case ElementType::NULL_VALUE: { + if (!format_settings.null_as_default) + { + error = "cannot convert null to Decimal value"; + return false; + } + break; + } + default: { + error = fmt::format("cannot read Decimal value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + } + + assert_cast &>(column).insertValue(value); + return true; + } + +private: + UInt32 scale; +}; + + +template +class DateTime64Node : public JSONExtractTreeNode, public TimezoneMixin +{ +public: + explicit DateTime64Node(const DataTypeDateTime64 & datetime64_type) : TimezoneMixin(datetime64_type), scale(datetime64_type.getScale()) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + DateTime64 value; + if (element.isString()) + { + if (!tryParse(value, element.getString(), format_settings.date_time_input_format)) + { + error = fmt::format("cannot parse DateTime64 value here: {}", element.getString()); + return false; + } + } + else + { + switch (element.type()) + { + case ElementType::DOUBLE: + value = convertToDecimal, DataTypeDecimal>(element.getDouble(), scale); + break; + case ElementType::UINT64: + value = convertToDecimal, DataTypeDecimal>(element.getUInt64(), scale); + break; + case ElementType::INT64: + value = convertToDecimal, DataTypeDecimal>(element.getInt64(), scale); + break; + default: + error = fmt::format("cannot read DateTime64 value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; 
+ } + } + + assert_cast(column).insert(value); + return true; + } + + bool tryParse(DateTime64 & value, std::string_view data, FormatSettings::DateTimeInputFormat date_time_input_format) const + { + ReadBufferFromMemory buf(data.data(), data.size()); + switch (date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + if (tryReadDateTime64Text(value, scale, buf, time_zone) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + if (tryParseDateTime64BestEffort(value, scale, buf, time_zone, utc_time_zone) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + if (tryParseDateTime64BestEffortUS(value, scale, buf, time_zone, utc_time_zone) && buf.eof()) + return true; + break; + } + + return false; + } + +private: + UInt32 scale; +}; + +template +class EnumNode : public JSONExtractTreeNode +{ +public: + explicit EnumNode(const std::vector> & name_value_pairs_) : name_value_pairs(name_value_pairs_) + { + for (const auto & name_value_pair : name_value_pairs) + { + name_to_value_map.emplace(name_value_pair.first, name_value_pair.second); + only_values.emplace(name_value_pair.second); + } + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + if (format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + error = "cannot convert null to Enum value"; + return false; + } + + auto & col_vec = assert_cast &>(column); + + if (element.isInt64()) + { + Type value; + if (!accurate::convertNumeric(element.getInt64(), value) || !only_values.contains(value)) + { + error = fmt::format("cannot convert value {} to enum: there is no such value in enum", element.getInt64()); + return false; + } + col_vec.insertValue(value); + return true; + } + + if (element.isUInt64()) + { + 
Type value; + if (!accurate::convertNumeric(element.getUInt64(), value) || !only_values.contains(value)) + { + error = fmt::format("cannot convert value {} to enum: there is no such value in enum", element.getUInt64()); + return false; + } + col_vec.insertValue(value); + return true; + } + + if (element.isString()) + { + auto value = name_to_value_map.find(element.getString()); + if (value == name_to_value_map.end()) + { + error = fmt::format("cannot convert value {} to enum: there is no such value in enum", element.getString()); + return false; + } + col_vec.insertValue(value->second); + return true; + } + + error = fmt::format("cannot read Enum value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + +private: + std::vector> name_value_pairs; + std::unordered_map name_to_value_map; + std::unordered_set only_values; +}; + +template +class IPv4Node : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + if (!element.isString()) + { + error = fmt::format("cannot read IPv4 value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto data = element.getString(); + IPv4 value; + if (!tryParse(value, data)) + { + error = fmt::format("cannot parse IPv4 value here: {}", data); + return false; + } + + assert_cast(column).insert(value); + return true; + } + + static bool tryParse(IPv4 & value, std::string_view data) + { + ReadBufferFromMemory buf(data.data(), data.size()); + return tryReadIPv4Text(value, buf) && buf.eof(); + } +}; + +template +class IPv6Node : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, 
const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + if (!element.isString()) + { + error = fmt::format("cannot read IPv6 value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto data = element.getString(); + IPv6 value; + if (!tryParse(value, data)) + { + error = fmt::format("cannot parse IPv6 value here: {}", data); + return false; + } + + assert_cast(column).insert(value); + return true; + } + + + static bool tryParse(IPv6 & value, std::string_view data) + { + ReadBufferFromMemory buf(data.data(), data.size()); + return tryReadIPv6Text(value, buf) && buf.eof(); + } +}; + +template +class NullableNode : public JSONExtractTreeNode +{ +public: + explicit NullableNode(std::unique_ptr> nested_) : nested(std::move(nested_)) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull()) + { + column.insertDefault(); + return true; + } + + auto & col_null = assert_cast(column); + if (!nested-> insertResultToColumn(col_null.getNestedColumn(), element, insert_settings, format_settings, error)) + return false; + col_null.getNullMapColumn().insertValue(0); + return true; + } + +private: + std::unique_ptr> nested; +}; + +template +class LowCardinalityNode : public JSONExtractTreeNode +{ +public: + explicit LowCardinalityNode(bool is_nullable_, std::unique_ptr> nested_) + : is_nullable(is_nullable_), nested(std::move(nested_)) + { + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() 
&& (is_nullable || format_settings.null_as_default)) + { + column.insertDefault(); + return true; + } + + auto & col_lc = assert_cast(column); + auto tmp_nested = col_lc.getDictionary().getNestedColumn()->cloneEmpty(); + if (!nested-> insertResultToColumn(*tmp_nested, element, insert_settings, format_settings, error)) + return false; + + col_lc.insertFromFullColumn(*tmp_nested, 0); + return true; + } + +private: + bool is_nullable; + std::unique_ptr> nested; +}; + +template +class ArrayNode : public JSONExtractTreeNode +{ +public: + explicit ArrayNode(std::unique_ptr> nested_) : nested(std::move(nested_)) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (element.isNull() && format_settings.null_as_default) + { + column.insertDefault(); + return true; + } + + if (!element.isArray()) + { + error = fmt::format("cannot read Array value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto array = element.getArray(); + + auto & col_arr = assert_cast(column); + auto & data = col_arr.getData(); + size_t old_size = data.size(); + bool were_valid_elements = false; + + for (auto value : array) + { + if (nested-> insertResultToColumn(data, value, insert_settings, format_settings, error)) + { + were_valid_elements = true; + } + else if (insert_settings.insert_default_on_invalid_elements_in_complex_types) + { + data.insertDefault(); + } + else + { + data.popBack(data.size() - old_size); + return false; + } + } + + if (!were_valid_elements) + { + data.popBack(data.size() - old_size); + return false; + } + + col_arr.getOffsets().push_back(data.size()); + return true; + } + +private: + std::unique_ptr> nested; +}; + +template +class TupleNode : public JSONExtractTreeNode +{ +public: + TupleNode(std::vector>> nested_, const std::vector & explicit_names_) + 
: nested(std::move(nested_)), explicit_names(explicit_names_) + { + for (size_t i = 0; i != explicit_names.size(); ++i) + name_to_index_map.emplace(explicit_names[i], i); + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + auto & tuple = assert_cast(column); + size_t old_size = column.size(); + bool were_valid_elements = false; + + auto set_size = [&](size_t size) + { + for (size_t i = 0; i != tuple.tupleSize(); ++i) + { + auto & col = tuple.getColumn(i); + if (col.size() != size) + { + if (col.size() > size) + col.popBack(col.size() - size); + else + while (col.size() < size) + col.insertDefault(); + } + } + }; + + if (element.isArray()) + { + auto array = element.getArray(); + auto it = array.begin(); + + for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index) + { + if (nested[index]-> insertResultToColumn(tuple.getColumn(index), *it++, insert_settings, format_settings, error)) + { + were_valid_elements = true; + } + else if (insert_settings.insert_default_on_invalid_elements_in_complex_types) + { + tuple.getColumn(index).insertDefault(); + } + else + { + set_size(old_size); + error += fmt::format("(during reading tuple {} element)", index); + return false; + } + } + + set_size(old_size + static_cast(were_valid_elements)); + return were_valid_elements; + } + + if (element.isObject()) + { + auto object = element.getObject(); + if (name_to_index_map.empty()) + { + auto it = object.begin(); + for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index) + { + if (nested[index]-> insertResultToColumn(tuple.getColumn(index), (*it++).second, insert_settings, format_settings, error)) + { + were_valid_elements = true; + } + else if (insert_settings.insert_default_on_invalid_elements_in_complex_types) + { + tuple.getColumn(index).insertDefault(); + } + 
else + { + set_size(old_size); + error += fmt::format("(during reading tuple {} element)", index); + return false; + } + } + } + else + { + for (const auto & [key, value] : object) + { + auto index = name_to_index_map.find(key); + if (index != name_to_index_map.end()) + { + if (nested[index->second]-> insertResultToColumn(tuple.getColumn(index->second), value, insert_settings, format_settings, error)) + { + were_valid_elements = true; + } + else if (!insert_settings.insert_default_on_invalid_elements_in_complex_types) + { + set_size(old_size); + error += fmt::format("(during reading tuple element \"{}\")", key); + return false; + } + } + } + } + + set_size(old_size + static_cast(were_valid_elements)); + return were_valid_elements; + } + + error = fmt::format("cannot read Tuple value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + +private: + std::vector>> nested; + std::vector explicit_names; + std::unordered_map name_to_index_map; +}; + +template +class MapNode : public JSONExtractTreeNode +{ +public: + explicit MapNode(std::unique_ptr> value_) : value(std::move(value_)) { } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + if (!element.isObject()) + { + error = fmt::format("cannot read Map value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + + auto & map_col = assert_cast(column); + auto & offsets = map_col.getNestedColumn().getOffsets(); + auto & tuple_col = map_col.getNestedData(); + auto & key_col = tuple_col.getColumn(0); + auto & value_col = tuple_col.getColumn(1); + size_t old_size = tuple_col.size(); + + auto object = element.getObject(); + auto it = object.begin(); + for (; it != object.end(); ++it) + { + auto pair = *it; + + /// Insert key + key_col.insertData(pair.first.data(), 
pair.first.size()); + + /// Insert value + if (!value-> insertResultToColumn(value_col, pair.second, insert_settings, format_settings, error)) + { + if (insert_settings.insert_default_on_invalid_elements_in_complex_types) + { + value_col.insertDefault(); + } + else + { + key_col.popBack(key_col.size() - offsets.back()); + value_col.popBack(value_col.size() - offsets.back()); + error += fmt::format("(during reading value of key \"{}\")", pair.first); + return false; + } + } + } + + offsets.push_back(old_size + object.size()); + return true; + } + +private: + std::unique_ptr> value; +}; + +template +class VariantNode : public JSONExtractTreeNode +{ +public: + VariantNode(std::vector>> variant_nodes_, std::vector order_) + : variant_nodes(std::move(variant_nodes_)), order(std::move(order_)) + { + } + + bool insertResultToColumn( + IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + auto & column_variant = assert_cast(column); + for (size_t i : order) + { + auto & variant = column_variant.getVariantByGlobalDiscriminator(i); + if (variant_nodes[i]-> insertResultToColumn(variant, element, insert_settings, format_settings, error)) + { + column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(i)); + column_variant.getOffsets().push_back(variant.size() - 1); + return true; + } + } + + error = fmt::format("cannot read Map value from JSON element: {}", jsonElementToString(element, format_settings)); + return false; + } + +private: + std::vector>> variant_nodes; + /// Order in which we should try variants nodes. + /// For example, String should be always the last one. 
+ std::vector order; +}; + + +template +class DynamicNode : public JSONExtractTreeNode +{ +public: + bool insertResultToColumn(IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + { + auto & column_dynamic = assert_cast(column); + auto & variant_column = column_dynamic.getVariantColumn(); + auto variant_info = column_dynamic.getVariantInfo(); + /// First, infer ClickHouse type for this element and add it as a new variant. + auto element_type = elementToDataType(element, format_settings); + if (column_dynamic.addNewVariant(element_type)) + { + auto node = buildJSONExtractTree(element_type, "Dynamic inference"); + auto global_discriminator = variant_info.variant_name_to_discriminator[element_type->getName()]; + auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discriminator); + if (!node-> insertResultToColumn(variant, element, insert_settings, format_settings, error)) + return false; + variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discriminator)); + variant_column.getOffsets().push_back(variant.size() - 1); + return true; + } + + /// We couldn't add new variant. Try to insert element into current variants. + auto variant_node = buildJSONExtractTree(variant_info.variant_type, "Dynamic inference"); + if (variant_node-> insertResultToColumn(variant_column, element, insert_settings, format_settings, error)) + return true; + + /// We couldn't insert element into any existing variant, add String variant and read value as String. 
+ column_dynamic.addStringVariant(); + auto string_global_discriminator = variant_info.variant_name_to_discriminator["String"]; + auto & string_column = variant_column.getVariantByGlobalDiscriminator(string_global_discriminator); + if (!getStringNode()-> insertResultToColumn(string_column, element, insert_settings, format_settings, error)) + return false; + variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(string_global_discriminator)); + variant_column.getOffsets().push_back(string_column.size() - 1); + return true; + } + + static const std::unique_ptr> & getStringNode() + { + static const std::unique_ptr> string_node + = buildJSONExtractTree(std::make_shared(), "Dynamic inference"); + return string_node; + } + + static DataTypePtr elementToDataType(const typename JSONParser::Element & element, const FormatSettings & format_settings) + { + JSONInferenceInfo json_inference_info; + auto type = elementToDataTypeImpl(element, format_settings, json_inference_info); + transformFinalInferredJSONTypeIfNeeded(type, format_settings, &json_inference_info); + return type; + } + +private: + static DataTypePtr elementToDataTypeImpl(const typename JSONParser::Element & element, const FormatSettings & format_settings, JSONInferenceInfo & json_inference_info) + { + switch (element.type()) + { + case ElementType::NULL_VALUE: + return makeNullable(std::make_shared()); + case ElementType::BOOL: + return DataTypeFactory::instance().get("Bool"); + case ElementType::INT64: + { + auto type = std::make_shared(); + if (element.getInt64() < 0) + json_inference_info.negative_integers.insert(type.get()); + return type; + } + case ElementType::UINT64: + return std::make_shared(); + case ElementType::DOUBLE: + return std::make_shared(); + case ElementType::STRING: + { + auto data = element.getString(); + + if (auto type = tryInferDateOrDateTimeFromString(data, format_settings)) + return type; + + if (format_settings.json.try_infer_numbers_from_strings) 
+ { + bool is_negative = false; + if (auto type = tryInferJSONNumberFromString(data, format_settings, &json_inference_info)) + { + json_inference_info.numbers_parsed_from_json_strings.insert(type.get()); + if (is_negative) + json_inference_info.negative_integers.insert(type.get()); + return type; + } + } + + return std::make_shared(); + } + case ElementType::ARRAY: + { + auto array = element.getArray(); + DataTypes types; + types.reserve(array.size()); + for (auto value : array) + types.push_back(makeNullableSafe(elementToDataTypeImpl(value, format_settings, json_inference_info))); + + if (types.empty()) + return std::make_shared(makeNullable(std::make_shared())); + + if (checkIfTypesAreEqual(types)) + return std::make_shared(types.back()); + + /// For JSON if we have not complete types, we should not try to transform them + /// and return it as a Tuple. + /// For example, if we have types [Nullable(Float64), Nullable(Nothing), Nullable(Float64)] + /// it can be Array(Nullable(Float64)) or Tuple(Nullable(Float64), , Nullable(Float64)) and + /// we can't determine which one it is right now. But we will be able to do it later + /// when we will have the final top level type. + /// For example, we can have JSON element [[42.42, null, 43.43], [44.44, "Some string", 45.45]] and we should + /// determine the type for this element as Tuple(Nullable(Float64), Nullable(String), Nullable(Float64)). + for (const auto & type : types) + { + if (!checkIfTypeIsComplete(type)) + return std::make_shared(types); + } + + auto types_copy = types; + transformInferredJSONTypesIfNeeded(types_copy, format_settings, &json_inference_info); + + if (checkIfTypesAreEqual(types_copy)) + return std::make_shared(types_copy.back()); + + return std::make_shared(types); + } + case ElementType::OBJECT: { + /// TODO: Use new JSON type here when it's ready. 
+ return std::make_shared(std::make_shared(), makeNullable(std::make_shared())); + } + } + } +}; + +} + +template +std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message) +{ + switch (type->getTypeId()) + { + case TypeIndex::UInt8: + return std::make_unique>(isBool(type)); + case TypeIndex::UInt16: + return std::make_unique>(); + case TypeIndex::UInt32: + return std::make_unique>(); + case TypeIndex::UInt64: + return std::make_unique>(); + case TypeIndex::UInt128: + return std::make_unique>(); + case TypeIndex::UInt256: + return std::make_unique>(); + case TypeIndex::Int8: + return std::make_unique>(); + case TypeIndex::Int16: + return std::make_unique>(); + case TypeIndex::Int32: + return std::make_unique>(); + case TypeIndex::Int64: + return std::make_unique>(); + case TypeIndex::Int128: + return std::make_unique>(); + case TypeIndex::Int256: + return std::make_unique>(); + case TypeIndex::Float32: + return std::make_unique>(); + case TypeIndex::Float64: + return std::make_unique>(); + case TypeIndex::String: + return std::make_unique>(); + case TypeIndex::FixedString: + return std::make_unique>(assert_cast(*type).getN()); + case TypeIndex::UUID: + return std::make_unique>(); + case TypeIndex::IPv4: + return std::make_unique>(); + case TypeIndex::IPv6: + return std::make_unique>(); + case TypeIndex::Date:; + return std::make_unique>(); + case TypeIndex::Date32: + return std::make_unique>(); + case TypeIndex::DateTime: + return std::make_unique>(assert_cast(*type)); + case TypeIndex::DateTime64: + return std::make_unique>(assert_cast(*type)); + case TypeIndex::Decimal32: + return std::make_unique>(type); + case TypeIndex::Decimal64: + return std::make_unique>(type); + case TypeIndex::Decimal128: + return std::make_unique>(type); + case TypeIndex::Decimal256: + return std::make_unique>(type); + case TypeIndex::Enum8: + return std::make_unique>(assert_cast(*type).getValues()); + case TypeIndex::Enum16: + return 
std::make_unique>(assert_cast(*type).getValues()); + case TypeIndex::LowCardinality: + { + /// To optimize inserting into LowCardinality we have special nodes for LowCardinality of numeric and string types. + auto lc_type = typeid_cast(type.get()); + auto dictionary_type = removeNullable(lc_type->getDictionaryType()); + bool is_nullable = lc_type->isLowCardinalityNullable(); + + switch (dictionary_type->getTypeId()) + { + case TypeIndex::UInt8: + return std::make_unique>(is_nullable, isBool(type)); + case TypeIndex::UInt16: + return std::make_unique>(is_nullable); + case TypeIndex::UInt32: + return std::make_unique>(is_nullable); + case TypeIndex::UInt64: + return std::make_unique>(is_nullable); + case TypeIndex::Int8: + return std::make_unique>(is_nullable); + case TypeIndex::Int16: + return std::make_unique>(is_nullable); + case TypeIndex::Int32: + return std::make_unique>(is_nullable); + case TypeIndex::Int64: + return std::make_unique>(is_nullable); + case TypeIndex::Float32: + return std::make_unique>(is_nullable); + case TypeIndex::Float64: + return std::make_unique>(is_nullable); + case TypeIndex::String: + return std::make_unique>(is_nullable); + case TypeIndex::FixedString: + return std::make_unique>(is_nullable, assert_cast(*dictionary_type).getN()); + case TypeIndex::UUID: + return std::make_unique>(is_nullable); + default: + return std::make_unique>(is_nullable, buildJSONExtractTree(dictionary_type, source_for_exception_message)); + } + } + case TypeIndex::Nullable: + return std::make_unique>(buildJSONExtractTree(assert_cast(*type).getNestedType(), source_for_exception_message)); + case TypeIndex::Array: + return std::make_unique>(buildJSONExtractTree(assert_cast(*type).getNestedType(), source_for_exception_message)); + case TypeIndex::Tuple: + { + const auto & tuple = assert_cast(*type); + const auto & tuple_elements = tuple.getElements(); + std::vector>> elements; + elements.reserve(tuple_elements.size()); + for (const auto & tuple_element : 
tuple_elements) + elements.emplace_back(buildJSONExtractTree(tuple_element, source_for_exception_message)); + return std::make_unique>(std::move(elements), tuple.haveExplicitNames() ? tuple.getElementNames() : Strings{}); + } + case TypeIndex::Map: + { + const auto & map_type = assert_cast(*type); + const auto & key_type = map_type.getKeyType(); + if (!isString(removeLowCardinality(key_type))) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "{} doesn't support the return type schema: {} with key type not String", + source_for_exception_message, + type->getName()); + + const auto & value_type = map_type.getValueType(); + return std::make_unique>(buildJSONExtractTree(value_type, source_for_exception_message)); + } + case TypeIndex::Variant: + { + const auto & variant_type = assert_cast(*type); + const auto & variants = variant_type.getVariants(); + std::vector>> variant_nodes; + variant_nodes.reserve(variants.size()); + for (const auto & variant : variants) + variant_nodes.push_back(buildJSONExtractTree(variant, source_for_exception_message)); + return std::make_unique>(std::move(variant_nodes), SerializationVariant::getVariantsDeserializeTextOrder(variants)); + } + case TypeIndex::Dynamic: + return std::make_unique>(); + default: + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "{} doesn't support the return type schema: {}", + source_for_exception_message, + type->getName()); + } +} + +#if USE_SIMDJSON +template void jsonElementToString(const SimdJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); +template std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); +#endif + +#if USE_RAPIDJSON +template void jsonElementToString(const RapidJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); +template std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); +#else 
+template void jsonElementToString(const DummyJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); +template std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); +#endif + +} diff --git a/src/Formats/JSONExtractTree.h b/src/Formats/JSONExtractTree.h new file mode 100644 index 00000000000..4735f568b1c --- /dev/null +++ b/src/Formats/JSONExtractTree.h @@ -0,0 +1,35 @@ +#pragma once +#include +#include +#include + + +namespace DB +{ + +struct JSONExtractInsertSettings +{ + bool convert_bool_to_integer = true; + bool insert_default_on_invalid_elements_in_complex_types = false; +}; + +template +class JSONExtractTreeNode +{ +public: + JSONExtractTreeNode() = default; + virtual ~JSONExtractTreeNode() = default; + virtual bool insertResultToColumn(IColumn &, const typename JSONParser::Element &, const JSONExtractInsertSettings & insert_setting, const FormatSettings & format_settings, String & error) const = 0; +}; + +/// Build a tree for insertion JSON element into a column with provided data type. 
+template +std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); + +template +void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); + +template +bool tryGetNumericValueFromJSONElement(NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, String & error); + +} diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 31faea2e13e..6519d54a8c5 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -225,19 +225,6 @@ namespace Paths paths; }; - bool checkIfTypesAreEqual(const DataTypes & types) - { - if (types.empty()) - return true; - - for (size_t i = 1; i < types.size(); ++i) - { - if (!types[0]->equals(*types[i])) - return false; - } - return true; - } - void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes) { type_indexes.clear(); @@ -272,24 +259,31 @@ namespace type_indexes.erase(TypeIndex::Nothing); } - /// If we have both Int64 and UInt64, convert all Int64 to UInt64, + /// If we have both Int64 and UInt64, convert all not-negative Int64 to UInt64, /// because UInt64 is inferred only in case of Int64 overflow. 
- void transformIntegers(DataTypes & data_types, TypeIndexesSet & type_indexes) + void transformIntegers(DataTypes & data_types, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info) { if (!type_indexes.contains(TypeIndex::Int64) || !type_indexes.contains(TypeIndex::UInt64)) return; + bool have_negative_integers = false; for (auto & type : data_types) { if (WhichDataType(type).isInt64()) - type = std::make_shared(); + { + bool is_negative = json_info->negative_integers.contains(type.get()); + have_negative_integers |= is_negative; + if (!is_negative) + type = std::make_shared(); + } } - type_indexes.erase(TypeIndex::Int64); + if (!have_negative_integers) + type_indexes.erase(TypeIndex::Int64); } /// If we have both Int64 and Float64 types, convert all Int64 to Float64. - void transformIntegersAndFloatsToFloats(DataTypes & data_types, TypeIndexesSet & type_indexes) + void transformIntegersAndFloatsToFloats(DataTypes & data_types, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info) { bool have_floats = type_indexes.contains(TypeIndex::Float64); bool have_integers = type_indexes.contains(TypeIndex::Int64) || type_indexes.contains(TypeIndex::UInt64); @@ -300,7 +294,12 @@ namespace { WhichDataType which(type); if (which.isInt64() || which.isUInt64()) - type = std::make_shared(); + { + auto new_type = std::make_shared(); + if (json_info->numbers_parsed_from_json_strings.erase(type.get())) + json_info->numbers_parsed_from_json_strings.insert(new_type.get()); + type = new_type; + } } type_indexes.erase(TypeIndex::Int64); @@ -635,9 +634,9 @@ namespace if (settings.try_infer_integers) { /// Transform Int64 to UInt64 if needed. - transformIntegers(data_types, type_indexes); + transformIntegers(data_types, type_indexes, json_info); /// Transform integers to floats if needed. 
- transformIntegersAndFloatsToFloats(data_types, type_indexes); + transformIntegersAndFloatsToFloats(data_types, type_indexes, json_info); } /// Transform Date to DateTime or both to String if needed. @@ -887,7 +886,7 @@ namespace } template - DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) + DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info) { if (buf.eof()) return nullptr; @@ -911,7 +910,12 @@ namespace Int64 tmp_int; buf.position() = number_start; if (tryReadIntText(tmp_int, buf)) - return std::make_shared(); + { + auto type = std::make_shared(); + if (json_info && tmp_int < 0) + json_info->negative_integers.insert(type.get()); + return type; + } /// In case of Int64 overflow we can try to infer UInt64. UInt64 tmp_uint; @@ -934,7 +938,12 @@ namespace Int64 tmp_int; if (tryReadIntText(tmp_int, peekable_buf)) - return std::make_shared(); + { + auto type = std::make_shared(); + if (json_info && tmp_int < 0) + json_info->negative_integers.insert(type.get()); + return type; + } peekable_buf.rollbackToCheckpoint(/* drop= */ true); /// In case of Int64 overflow we can try to infer UInt64. @@ -952,7 +961,7 @@ namespace } template - DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings) + DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_inference_info = nullptr) { ReadBufferFromString buf(field); @@ -960,7 +969,12 @@ namespace { Int64 tmp_int; if (tryReadIntText(tmp_int, buf) && buf.eof()) - return std::make_shared(); + { + auto type = std::make_shared(); + if (json_inference_info && tmp_int < 0) + json_inference_info->negative_integers.insert(type.get()); + return type; + } /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. 
buf.position() = buf.buffer().begin(); @@ -1011,7 +1025,7 @@ namespace { if (settings.json.try_infer_numbers_from_strings) { - if (auto number_type = tryInferNumberFromStringImpl(field, settings)) + if (auto number_type = tryInferNumberFromStringImpl(field, settings, json_info)) { json_info->numbers_parsed_from_json_strings.insert(number_type.get()); return number_type; @@ -1254,10 +1268,23 @@ namespace } /// Number - return tryInferNumber(buf, settings); + return tryInferNumber(buf, settings, json_info); } } +bool checkIfTypesAreEqual(const DataTypes & types) +{ + if (types.empty()) + return true; + + for (size_t i = 1; i < types.size(); ++i) + { + if (!types[0]->equals(*types[i])) + return false; + } + return true; +} + void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings) { DataTypes types = {first, second}; @@ -1275,6 +1302,11 @@ void transformInferredJSONTypesIfNeeded( second = std::move(types[1]); } +void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + transformInferredTypesIfNeededImpl(types, settings, json_info); +} + void transformInferredJSONTypesFromDifferentFilesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings) { JSONInferenceInfo json_info; @@ -1396,6 +1428,12 @@ DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSetting return tryInferNumberFromStringImpl(field, settings); } +DataTypePtr tryInferJSONNumberFromString(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + return tryInferNumberFromStringImpl(field, settings, json_info); + +} + DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings) { if (settings.try_infer_dates && tryInferDate(field)) diff --git a/src/Formats/SchemaInferenceUtils.h b/src/Formats/SchemaInferenceUtils.h index bcf3d194825..06c14c0797a 100644 --- 
a/src/Formats/SchemaInferenceUtils.h +++ b/src/Formats/SchemaInferenceUtils.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -18,6 +19,11 @@ struct JSONInferenceInfo /// We store numbers that were parsed from strings. /// It's used in types transformation to change such numbers back to string if needed. std::unordered_set numbers_parsed_from_json_strings; + /// Store integer types that were inferred from negative numbers. + /// It's used to determine common type for Int64 and UInt64 + /// TODO: check it not only in JSON formats. + std::unordered_set negative_integers; + /// Indicates if currently we are inferring type for Map/Object key. bool is_object_key = false; /// When we transform types for the same column from different files @@ -48,6 +54,7 @@ DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const Forma /// Try to parse a number value from a string. By default, it tries to parse Float64, /// but if setting try_infer_integers is enabled, it also tries to parse Int64. DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings); +DataTypePtr tryInferJSONNumberFromString(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info); /// It takes two types inferred for the same column and tries to transform them to a common type if possible. /// It's also used when we try to infer some not ordinary types from another types. @@ -77,6 +84,7 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c /// Example 2: /// We merge DataTypeJSONPaths types to a single DataTypeJSONPaths type with union of all JSON paths. void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info); +void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info); /// Make final transform for types inferred in JSON format. 
It does 3 types of transformation: /// 1) Checks if type is unnamed Tuple(...), tries to transform nested types to find a common type for them and if all nested types @@ -107,4 +115,6 @@ NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header); /// Check if type contains Nothing, like Array(Tuple(Nullable(Nothing), String)) bool checkIfTypeIsComplete(const DataTypePtr & type); +bool checkIfTypesAreEqual(const DataTypes & types); + } diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index fbd987577e9..c6af0674db7 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -1,10 +1,1069 @@ -#include +#include +#include + +#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include +#include +#include +#include +#include + +#include + +#include "config.h" namespace DB { +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +template +concept HasIndexOperator = requires (T t) +{ + t[0]; +}; + +/// Functions to parse JSONs and extract values from it. +/// The first argument of all these functions gets a JSON, +/// after that there are any number of arguments specifying path to a desired part from the JSON's root. 
+/// For example, +/// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 + +class FunctionJSONHelpers +{ +public: + template typename Impl, class JSONParser> + class Executor + { + public: + static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, const FormatSettings & format_settings) + { + MutableColumnPtr to{result_type->createColumn()}; + to->reserve(input_rows_count); + + if (arguments.empty()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name)); + + const auto & first_column = arguments[0]; + if (!isString(first_column.type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The first argument of function {} should be a string containing JSON, illegal type: " + "{}", String(Name::name), first_column.type->getName()); + + const ColumnPtr & arg_json = first_column.column; + const auto * col_json_const = typeid_cast(arg_json.get()); + const auto * col_json_string + = typeid_cast(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get()); + + if (!col_json_string) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName()); + + const ColumnString::Chars & chars = col_json_string->getChars(); + const ColumnString::Offsets & offsets = col_json_string->getOffsets(); + + size_t num_index_arguments = Impl::getNumberOfIndexArguments(arguments); + std::vector moves = prepareMoves(Name::name, arguments, 1, num_index_arguments); + + /// Preallocate memory in parser if necessary. + JSONParser parser; + if constexpr (has_member_function_reserve::value) + { + size_t max_size = calculateMaxSize(offsets); + if (max_size) + parser.reserve(max_size); + } + + Impl impl; + + /// prepare() does Impl-specific preparation before handling each row. 
+ if constexpr (has_member_function_prepare::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value) + impl.prepare(Name::name, arguments, result_type); + + using Element = typename JSONParser::Element; + + Element document; + bool document_ok = false; + if (col_json_const) + { + std::string_view json{reinterpret_cast(chars.data()), offsets[0] - 1}; + document_ok = parser.parse(json, document); + } + + String error; + for (const auto i : collections::range(0, input_rows_count)) + { + if (!col_json_const) + { + std::string_view json{reinterpret_cast(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1}; + document_ok = parser.parse(json, document); + } + + bool added_to_column = false; + if (document_ok) + { + /// Perform moves. + Element element; + std::string_view last_key; + bool moves_ok = performMoves(arguments, i, document, moves, element, last_key); + + if (moves_ok) + added_to_column = impl.insertResultToColumn(*to, element, last_key, format_settings, error); + } + + /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions. + if (!added_to_column) + to->insertDefault(); + } + return to; + } + }; + +private: + BOOST_TTI_HAS_MEMBER_FUNCTION(reserve) + BOOST_TTI_HAS_MEMBER_FUNCTION(prepare) + + /// Represents a move of a JSON iterator described by a single argument passed to a JSON function. + /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) + /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}. + /// Keys and indices can be nonconst, in this case they are calculated for each row. 
+ enum class MoveType : uint8_t + { + Key, + Index, + ConstKey, + ConstIndex, + }; + + struct Move + { + explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {} + Move(MoveType type_, const String & key_) : type(type_), key(key_) {} + MoveType type; + size_t index = 0; + String key; + }; + + static std::vector prepareMoves( + const char * function_name, + const ColumnsWithTypeAndName & columns, + size_t first_index_argument, + size_t num_index_arguments) + { + std::vector moves; + moves.reserve(num_index_arguments); + for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments)) + { + const auto & column = columns[i]; + if (!isString(column.type) && !isNativeInteger(column.type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The argument {} of function {} should be a string specifying key " + "or an integer specifying index, illegal type: {}", + std::to_string(i + 1), String(function_name), column.type->getName()); + + if (column.column && isColumnConst(*column.column)) + { + const auto & column_const = assert_cast(*column.column); + if (isString(column.type)) + moves.emplace_back(MoveType::ConstKey, column_const.getValue()); + else + moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0)); + } + else + { + if (isString(column.type)) + moves.emplace_back(MoveType::Key, ""); + else + moves.emplace_back(MoveType::Index, 0); + } + } + return moves; + } + + + /// Performs moves of types MoveType::Index and MoveType::ConstIndex. 
+ template + static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row, + const typename JSONParser::Element & document, const std::vector & moves, + typename JSONParser::Element & element, std::string_view & last_key) + { + typename JSONParser::Element res_element = document; + std::string_view key; + + for (size_t j = 0; j != moves.size(); ++j) + { + switch (moves[j].type) + { + case MoveType::ConstIndex: + { + if (!moveToElementByIndex(res_element, static_cast(moves[j].index), key)) + return false; + break; + } + case MoveType::ConstKey: + { + key = moves[j].key; + if (!moveToElementByKey(res_element, key)) + return false; + break; + } + case MoveType::Index: + { + Int64 index = (*arguments[j + 1].column)[row].get(); + if (!moveToElementByIndex(res_element, static_cast(index), key)) + return false; + break; + } + case MoveType::Key: + { + key = arguments[j + 1].column->getDataAt(row).toView(); + if (!moveToElementByKey(res_element, key)) + return false; + break; + } + } + } + + element = res_element; + last_key = key; + return true; + } + + template + static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key) + { + if (element.isArray()) + { + auto array = element.getArray(); + if (index >= 0) + --index; + else + index += array.size(); + + if (static_cast(index) >= array.size()) + return false; + element = array[index]; + out_key = {}; + return true; + } + + if constexpr (HasIndexOperator) + { + if (element.isObject()) + { + auto object = element.getObject(); + if (index >= 0) + --index; + else + index += object.size(); + + if (static_cast(index) >= object.size()) + return false; + std::tie(out_key, element) = object[index]; + return true; + } + } + + return {}; + } + + /// Performs moves of types MoveType::Key and MoveType::ConstKey. 
+ template + static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key) + { + if (!element.isObject()) + return false; + auto object = element.getObject(); + return object.find(key, element); + } + + static size_t calculateMaxSize(const ColumnString::Offsets & offsets) + { + size_t max_size = 0; + for (const auto i : collections::range(0, offsets.size())) + { + size_t size = offsets[i] - offsets[i - 1]; + max_size = std::max(max_size, size); + } + if (max_size) + --max_size; + return max_size; + } + +}; + +template +class JSONExtractImpl; + +template +class JSONExtractKeysAndValuesImpl; + +/** +* Functions JSONExtract and JSONExtractKeysAndValues force the return type - it is specified in the last argument. +* For example - `SELECT JSONExtract(materialize('{"a": 131231, "b": 1234}'), 'b', 'LowCardinality(FixedString(4))')` +* But by default ClickHouse decides on its own whether the return type will be LowCardinality based on the types of +* input arguments. +* And for these specific functions we cannot rely on this mechanism, so these functions have their own implementation - +* just convert all of the LowCardinality input columns to full ones, execute and wrap the resulting column in LowCardinality +* if needed. 
+*/ +template typename Impl> +constexpr bool functionForcesTheReturnType() +{ + return std::is_same_v, JSONExtractImpl> || std::is_same_v, JSONExtractKeysAndValuesImpl>; +} + +template typename Impl> +class ExecutableFunctionJSON : public IExecutableFunction +{ + +public: + explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_, const FormatSettings & format_settings_) + : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_), format_settings(format_settings_) + { + format_settings.json.escape_forward_slashes = false; + format_settings.null_as_default = false; + } + + String getName() const override { return Name::name; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + bool useDefaultImplementationForLowCardinalityColumns() const override + { + return !functionForcesTheReturnType(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + if (null_presence.has_null_constant) + return result_type->createColumnConstWithDefaultValue(input_rows_count); + + if constexpr (functionForcesTheReturnType()) + { + ColumnsWithTypeAndName columns_without_low_cardinality = arguments; + + for (auto & column : columns_without_low_cardinality) + { + column.column = recursiveRemoveLowCardinality(column.column); + column.type = recursiveRemoveLowCardinality(column.type); + } + + ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? 
createBlockWithNestedColumns(columns_without_low_cardinality) : columns_without_low_cardinality; + ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); + + if (null_presence.has_nullable) + temporary_result = wrapInNullable(temporary_result, columns_without_low_cardinality, result_type, input_rows_count); + + if (result_type->lowCardinality()) + temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); + + return temporary_result; + } + else + { + ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments; + ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); + + if (null_presence.has_nullable) + temporary_result = wrapInNullable(temporary_result, arguments, result_type, input_rows_count); + + if (result_type->lowCardinality()) + temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); + + return temporary_result; + } + } + +private: + + ColumnPtr + chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const + { +#if USE_SIMDJSON + if (allow_simdjson) + return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count, format_settings); +#endif + +#if USE_RAPIDJSON + return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count, format_settings); +#else + return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count, format_settings); +#endif + } + + NullPresence null_presence; + bool allow_simdjson; + DataTypePtr json_return_type; + FormatSettings format_settings; +}; + + +template typename Impl> +class FunctionBaseFunctionJSON : public IFunctionBase +{ +public: + explicit FunctionBaseFunctionJSON( + const NullPresence & null_presence_, + bool allow_simdjson_, + 
DataTypes argument_types_, + DataTypePtr return_type_, + DataTypePtr json_return_type_, + const FormatSettings & format_settings_) + : null_presence(null_presence_) + , allow_simdjson(allow_simdjson_) + , argument_types(std::move(argument_types_)) + , return_type(std::move(return_type_)) + , json_return_type(std::move(json_return_type_)) + , format_settings(format_settings_) + { + } + + String getName() const override { return Name::name; } + + const DataTypes & getArgumentTypes() const override + { + return argument_types; + } + + const DataTypePtr & getResultType() const override + { + return return_type; + } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override + { + return std::make_unique>(null_presence, allow_simdjson, json_return_type, format_settings); + } + +private: + NullPresence null_presence; + bool allow_simdjson; + DataTypes argument_types; + DataTypePtr return_type; + DataTypePtr json_return_type; + FormatSettings format_settings; +}; + +/// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing. +/// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL. 
+template typename Impl> +class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext +{ +public: + static constexpr auto name = Name::name; + + String getName() const override { return name; } + + static FunctionOverloadResolverPtr create(ContextPtr context_) + { + return std::make_unique(context_); + } + + explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {} + + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override + { + return !functionForcesTheReturnType(); + } + + FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override + { + bool has_nothing_argument = false; + for (const auto & arg : arguments) + has_nothing_argument |= isNothing(arg.type); + + DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); + NullPresence null_presence = getNullPresense(arguments); + DataTypePtr return_type; + if (has_nothing_argument) + return_type = std::make_shared(); + else if (null_presence.has_null_constant) + return_type = makeNullable(std::make_shared()); + else if (null_presence.has_nullable) + return_type = makeNullable(json_return_type); + else + return_type = json_return_type; + + DataTypes argument_types; + argument_types.reserve(arguments.size()); + for (const auto & argument : arguments) + argument_types.emplace_back(argument.type); + return std::make_unique>( + null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type, getFormatSettings(getContext())); + } +}; + +struct NameJSONHas { static constexpr auto name{"JSONHas"}; }; +struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; }; +struct NameJSONLength { static constexpr auto name{"JSONLength"}; }; +struct NameJSONKey { static constexpr auto 
name{"JSONKey"}; }; +struct NameJSONType { static constexpr auto name{"JSONType"}; }; +struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; }; +struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; }; +struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; }; +struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; }; +struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; }; +struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; }; +struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; }; +struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; }; +struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; }; +struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; }; +struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; }; + + +template +class JSONHasImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared(); } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view, const FormatSettings &, String &) + { + ColumnVector & col_vec = assert_cast &>(dest); + col_vec.insertValue(1); + return true; + } +}; + + +template +class IsValidJSONImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) + { + if (arguments.size() != 1) + { + /// IsValidJSON() shouldn't get parameters other than JSON. 
+ throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument", + String(function_name)); + } + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; } + + static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view, const FormatSettings &, String &) + { + /// This function is called only if JSON is valid. + /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function. + ColumnVector & col_vec = assert_cast &>(dest); + col_vec.insertValue(1); + return true; + } +}; + + +template +class JSONLengthImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String &) + { + size_t size; + if (element.isArray()) + size = element.getArray().size(); + else if (element.isObject()) + size = element.getObject().size(); + else + return false; + + ColumnVector & col_vec = assert_cast &>(dest); + col_vec.insertValue(size); + return true; + } +}; + + +template +class JSONKeyImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key, const FormatSettings &, String &) + { + if (last_key.empty()) + return false; + ColumnString & col_str = assert_cast(dest); + col_str.insertData(last_key.data(), 
last_key.size()); + return true; + } +}; + + +template +class JSONTypeImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + static const std::vector> values = { + {"Array", '['}, + {"Object", '{'}, + {"String", '"'}, + {"Int64", 'i'}, + {"UInt64", 'u'}, + {"Double", 'd'}, + {"Bool", 'b'}, + {"Null", 0}, /// the default value for the column. + }; + return std::make_shared>(values); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String &) + { + UInt8 type; + switch (element.type()) + { + case ElementType::INT64: + type = 'i'; + break; + case ElementType::UINT64: + type = 'u'; + break; + case ElementType::DOUBLE: + type = 'd'; + break; + case ElementType::STRING: + type = '"'; + break; + case ElementType::ARRAY: + type = '['; + break; + case ElementType::OBJECT: + type = '{'; + break; + case ElementType::BOOL: + type = 'b'; + break; + case ElementType::NULL_VALUE: + type = 0; + break; + } + + ColumnVector & col_vec = assert_cast &>(dest); + col_vec.insertValue(type); + return true; + } +}; + + +template +class JSONExtractNumericImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared>(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static const std::unique_ptr> & getInsertNode() + { + static const std::unique_ptr> node = buildJSONExtractTree(std::make_shared>()); + } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String & error) + { + NumberType value; + + tryGetNumericValueFromJSONElement(value, element, 
convert_bool_to_integer, error); + + if (dest.getDataType() == TypeIndex::LowCardinality) + { + ColumnLowCardinality & col_low = assert_cast(dest); + col_low.insertData(reinterpret_cast(&value), sizeof(value)); + } + else + { + auto & col_vec = assert_cast &>(dest); + col_vec.insertValue(value); + } + return true; + } +}; + + +template +using JSONExtractInt64Impl = JSONExtractNumericImpl; +template +using JSONExtractUInt64Impl = JSONExtractNumericImpl; +template +using JSONExtractFloat64Impl = JSONExtractNumericImpl; + + +template +class JSONExtractBoolImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String &) + { + bool value; + switch (element.type()) + { + case ElementType::BOOL: + value = element.getBool(); + break; + case ElementType::INT64: + value = element.getInt64() != 0; + break; + case ElementType::UINT64: + value = element.getUInt64() != 0; + break; + default: + return false; + } + + auto & col_vec = assert_cast &>(dest); + col_vec.insertValue(static_cast(value)); + return true; + } +}; + +template +class JSONExtractRawImpl; + +template +class JSONExtractStringImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String & error) + { + if (element.isNull()) + return false; + + if (!element.isString()) + 
return JSONExtractRawImpl::insertResultToColumn(dest, element, {}, format_settings, error); + + auto str = element.getString(); + ColumnString & col_str = assert_cast(dest); + col_str.insertData(str.data(), str.size()); + return true; + } +}; + +template +class JSONExtractImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) + { + if (arguments.size() < 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); + + const auto & col = arguments.back(); + const auto * col_type_const = typeid_cast(col.column.get()); + if (!col_type_const || !isString(col.type)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "The last argument of function {} should " + "be a constant string specifying the return data type, illegal value: {}", + String(function_name), col.name); + + return DataTypeFactory::instance().get(col_type_const->getValue()); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } + + void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) + { + extract_tree = buildJSONExtractTree(result_type, function_name); + insert_settings.insert_default_on_invalid_elements_in_complex_types = true; + } + + bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String & error) + { + return extract_tree->insertResultToColumn(dest, element, insert_settings, format_settings, error); + } + +protected: + std::unique_ptr> extract_tree; + JSONExtractInsertSettings insert_settings; +}; + + +template +class JSONExtractKeysAndValuesImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) + { + if 
(arguments.size() < 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); + + const auto & col = arguments.back(); + const auto * col_type_const = typeid_cast(col.column.get()); + if (!col_type_const || !isString(col.type)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "The last argument of function {} should " + "be a constant string specifying the values' data type, illegal value: {}", + String(function_name), col.name); + + DataTypePtr key_type = std::make_unique(); + DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue()); + DataTypePtr tuple_type = std::make_unique(DataTypes{key_type, value_type}); + return std::make_unique(tuple_type); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } + + void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) + { + const auto tuple_type = typeid_cast(result_type.get())->getNestedType(); + const auto value_type = typeid_cast(tuple_type.get())->getElements()[1]; + extract_tree = buildJSONExtractTree(value_type, function_name); + insert_settings.insert_default_on_invalid_elements_in_complex_types = true; + } + + bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String & error) + { + if (!element.isObject()) + return false; + + auto object = element.getObject(); + + auto & col_arr = assert_cast(dest); + auto & col_tuple = assert_cast(col_arr.getData()); + size_t old_size = col_tuple.size(); + auto & col_key = assert_cast(col_tuple.getColumn(0)); + auto & col_value = col_tuple.getColumn(1); + + for (const auto & [key, value] : object) + { + if (extract_tree->insertResultToColumn(col_value, value, insert_settings, format_settings, error)) + col_key.insertData(key.data(), key.size()); + } + + if (col_tuple.size() == 
old_size) + return false; + + col_arr.getOffsets().push_back(col_tuple.size()); + return true; + } + +private: + std::unique_ptr> extract_tree; + JSONExtractInsertSettings insert_settings; +}; + + +template +class JSONExtractRawImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String &) + { + ColumnString & col_str = assert_cast(dest); + auto & chars = col_str.getChars(); + WriteBufferFromVector buf(chars, AppendModeTag()); + jsonElementToString(element, buf, format_settings); + buf.finalize(); + chars.push_back(0); + col_str.getOffsets().push_back(chars.size()); + return true; + } +}; + + +template +class JSONExtractArrayRawImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_shared(std::make_shared()); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String & error) + { + if (!element.isArray()) + return false; + + auto array = element.getArray(); + ColumnArray & col_res = assert_cast(dest); + + for (auto value : array) + JSONExtractRawImpl::insertResultToColumn(col_res.getData(), value, {}, format_settings, error); + + col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size()); + return true; + } +}; + + +template +class JSONExtractKeysAndValuesRawImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr 
getReturnType(const char *, const ColumnsWithTypeAndName &) + { + DataTypePtr string_type = std::make_unique(); + DataTypePtr tuple_type = std::make_unique(DataTypes{string_type, string_type}); + return std::make_unique(tuple_type); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings & format_settings, String & error) + { + if (!element.isObject()) + return false; + + auto object = element.getObject(); + + auto & col_arr = assert_cast(dest); + auto & col_tuple = assert_cast(col_arr.getData()); + auto & col_key = assert_cast(col_tuple.getColumn(0)); + auto & col_value = assert_cast(col_tuple.getColumn(1)); + + for (const auto & [key, value] : object) + { + col_key.insertData(key.data(), key.size()); + JSONExtractRawImpl::insertResultToColumn(col_value, value, {}, format_settings, error); + } + + col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size()); + return true; + } +}; + +template +class JSONExtractKeysImpl +{ +public: + using Element = typename JSONParser::Element; + + static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) + { + return std::make_unique(std::make_shared()); + } + + static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } + + bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String &) + { + if (!element.isObject()) + return false; + + auto object = element.getObject(); + + ColumnArray & col_res = assert_cast(dest); + auto & col_key = assert_cast(col_res.getData()); + + for (const auto & [key, value] : object) + { + col_key.insertData(key.data(), key.size()); + } + + col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size()); + return true; + } +}; + REGISTER_FUNCTION(JSON) { 
factory.registerFunction>(); diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h index 8a2ad457d34..5d44e22300d 100644 --- a/src/Functions/FunctionsJSON.h +++ b/src/Functions/FunctionsJSON.h @@ -1,1781 +1,1273 @@ -#pragma once - -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - - -#include "config.h" - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -} - -template -concept HasIndexOperator = requires (T t) -{ - t[0]; -}; - -/// Functions to parse JSONs and extract values from it. -/// The first argument of all these functions gets a JSON, -/// after that there are any number of arguments specifying path to a desired part from the JSON's root. 
-/// For example, -/// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 - -class FunctionJSONHelpers -{ -public: - template typename Impl, class JSONParser> - class Executor - { - public: - static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) - { - MutableColumnPtr to{result_type->createColumn()}; - to->reserve(input_rows_count); - - if (arguments.empty()) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name)); - - const auto & first_column = arguments[0]; - if (!isString(first_column.type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "The first argument of function {} should be a string containing JSON, illegal type: " - "{}", String(Name::name), first_column.type->getName()); - - const ColumnPtr & arg_json = first_column.column; - const auto * col_json_const = typeid_cast(arg_json.get()); - const auto * col_json_string - = typeid_cast(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get()); - - if (!col_json_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName()); - - const ColumnString::Chars & chars = col_json_string->getChars(); - const ColumnString::Offsets & offsets = col_json_string->getOffsets(); - - size_t num_index_arguments = Impl::getNumberOfIndexArguments(arguments); - std::vector moves = prepareMoves(Name::name, arguments, 1, num_index_arguments); - - /// Preallocate memory in parser if necessary. - JSONParser parser; - if constexpr (has_member_function_reserve::value) - { - size_t max_size = calculateMaxSize(offsets); - if (max_size) - parser.reserve(max_size); - } - - Impl impl; - - /// prepare() does Impl-specific preparation before handling each row. 
- if constexpr (has_member_function_prepare::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value) - impl.prepare(Name::name, arguments, result_type); - - using Element = typename JSONParser::Element; - - Element document; - bool document_ok = false; - if (col_json_const) - { - std::string_view json{reinterpret_cast(chars.data()), offsets[0] - 1}; - document_ok = parser.parse(json, document); - } - - for (const auto i : collections::range(0, input_rows_count)) - { - if (!col_json_const) - { - std::string_view json{reinterpret_cast(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1}; - document_ok = parser.parse(json, document); - } - - bool added_to_column = false; - if (document_ok) - { - /// Perform moves. - Element element; - std::string_view last_key; - bool moves_ok = performMoves(arguments, i, document, moves, element, last_key); - - if (moves_ok) - added_to_column = impl.insertResultToColumn(*to, element, last_key); - } - - /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions. - if (!added_to_column) - to->insertDefault(); - } - return to; - } - }; - -private: - BOOST_TTI_HAS_MEMBER_FUNCTION(reserve) - BOOST_TTI_HAS_MEMBER_FUNCTION(prepare) - - /// Represents a move of a JSON iterator described by a single argument passed to a JSON function. - /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) - /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}. - /// Keys and indices can be nonconst, in this case they are calculated for each row. 
- enum class MoveType : uint8_t - { - Key, - Index, - ConstKey, - ConstIndex, - }; - - struct Move - { - explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {} - Move(MoveType type_, const String & key_) : type(type_), key(key_) {} - MoveType type; - size_t index = 0; - String key; - }; - - static std::vector prepareMoves( - const char * function_name, - const ColumnsWithTypeAndName & columns, - size_t first_index_argument, - size_t num_index_arguments) - { - std::vector moves; - moves.reserve(num_index_arguments); - for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments)) - { - const auto & column = columns[i]; - if (!isString(column.type) && !isNativeInteger(column.type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "The argument {} of function {} should be a string specifying key " - "or an integer specifying index, illegal type: {}", - std::to_string(i + 1), String(function_name), column.type->getName()); - - if (column.column && isColumnConst(*column.column)) - { - const auto & column_const = assert_cast(*column.column); - if (isString(column.type)) - moves.emplace_back(MoveType::ConstKey, column_const.getValue()); - else - moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0)); - } - else - { - if (isString(column.type)) - moves.emplace_back(MoveType::Key, ""); - else - moves.emplace_back(MoveType::Index, 0); - } - } - return moves; - } - - - /// Performs moves of types MoveType::Index and MoveType::ConstIndex. 
- template - static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row, - const typename JSONParser::Element & document, const std::vector & moves, - typename JSONParser::Element & element, std::string_view & last_key) - { - typename JSONParser::Element res_element = document; - std::string_view key; - - for (size_t j = 0; j != moves.size(); ++j) - { - switch (moves[j].type) - { - case MoveType::ConstIndex: - { - if (!moveToElementByIndex(res_element, static_cast(moves[j].index), key)) - return false; - break; - } - case MoveType::ConstKey: - { - key = moves[j].key; - if (!moveToElementByKey(res_element, key)) - return false; - break; - } - case MoveType::Index: - { - Int64 index = (*arguments[j + 1].column)[row].get(); - if (!moveToElementByIndex(res_element, static_cast(index), key)) - return false; - break; - } - case MoveType::Key: - { - key = arguments[j + 1].column->getDataAt(row).toView(); - if (!moveToElementByKey(res_element, key)) - return false; - break; - } - } - } - - element = res_element; - last_key = key; - return true; - } - - template - static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key) - { - if (element.isArray()) - { - auto array = element.getArray(); - if (index >= 0) - --index; - else - index += array.size(); - - if (static_cast(index) >= array.size()) - return false; - element = array[index]; - out_key = {}; - return true; - } - - if constexpr (HasIndexOperator) - { - if (element.isObject()) - { - auto object = element.getObject(); - if (index >= 0) - --index; - else - index += object.size(); - - if (static_cast(index) >= object.size()) - return false; - std::tie(out_key, element) = object[index]; - return true; - } - } - - return {}; - } - - /// Performs moves of types MoveType::Key and MoveType::ConstKey. 
- template - static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key) - { - if (!element.isObject()) - return false; - auto object = element.getObject(); - return object.find(key, element); - } - - static size_t calculateMaxSize(const ColumnString::Offsets & offsets) - { - size_t max_size = 0; - for (const auto i : collections::range(0, offsets.size())) - { - size_t size = offsets[i] - offsets[i - 1]; - max_size = std::max(max_size, size); - } - if (max_size) - --max_size; - return max_size; - } - -}; - -template -class JSONExtractImpl; - -template -class JSONExtractKeysAndValuesImpl; - -/** -* Functions JSONExtract and JSONExtractKeysAndValues force the return type - it is specified in the last argument. -* For example - `SELECT JSONExtract(materialize('{"a": 131231, "b": 1234}'), 'b', 'LowCardinality(FixedString(4))')` -* But by default ClickHouse decides on its own whether the return type will be LowCardinality based on the types of -* input arguments. -* And for these specific functions we cannot rely on this mechanism, so these functions have their own implementation - -* just convert all of the LowCardinality input columns to full ones, execute and wrap the resulting column in LowCardinality -* if needed. 
-*/ -template typename Impl> -constexpr bool functionForcesTheReturnType() -{ - return std::is_same_v, JSONExtractImpl> || std::is_same_v, JSONExtractKeysAndValuesImpl>; -} - -template typename Impl> -class ExecutableFunctionJSON : public IExecutableFunction -{ - -public: - explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_) - : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_) - { - } - - String getName() const override { return Name::name; } - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForConstants() const override { return true; } - bool useDefaultImplementationForLowCardinalityColumns() const override - { - return !functionForcesTheReturnType(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - if (null_presence.has_null_constant) - return result_type->createColumnConstWithDefaultValue(input_rows_count); - - if constexpr (functionForcesTheReturnType()) - { - ColumnsWithTypeAndName columns_without_low_cardinality = arguments; - - for (auto & column : columns_without_low_cardinality) - { - column.column = recursiveRemoveLowCardinality(column.column); - column.type = recursiveRemoveLowCardinality(column.type); - } - - ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? 
createBlockWithNestedColumns(columns_without_low_cardinality) : columns_without_low_cardinality; - ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); - - if (null_presence.has_nullable) - temporary_result = wrapInNullable(temporary_result, columns_without_low_cardinality, result_type, input_rows_count); - - if (result_type->lowCardinality()) - temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); - - return temporary_result; - } - else - { - ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments; - ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); - - if (null_presence.has_nullable) - temporary_result = wrapInNullable(temporary_result, arguments, result_type, input_rows_count); - - if (result_type->lowCardinality()) - temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); - - return temporary_result; - } - } - -private: - - ColumnPtr - chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const - { -#if USE_SIMDJSON - if (allow_simdjson) - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#endif - -#if USE_RAPIDJSON - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#else - return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -#endif - } - - NullPresence null_presence; - bool allow_simdjson; - DataTypePtr json_return_type; -}; - - -template typename Impl> -class FunctionBaseFunctionJSON : public IFunctionBase -{ -public: - explicit FunctionBaseFunctionJSON( - const NullPresence & null_presence_, - bool allow_simdjson_, - DataTypes argument_types_, - DataTypePtr return_type_, - DataTypePtr json_return_type_) - : 
null_presence(null_presence_) - , allow_simdjson(allow_simdjson_) - , argument_types(std::move(argument_types_)) - , return_type(std::move(return_type_)) - , json_return_type(std::move(json_return_type_)) - { - } - - String getName() const override { return Name::name; } - - const DataTypes & getArgumentTypes() const override - { - return argument_types; - } - - const DataTypePtr & getResultType() const override - { - return return_type; - } - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override - { - return std::make_unique>(null_presence, allow_simdjson, json_return_type); - } - -private: - NullPresence null_presence; - bool allow_simdjson; - DataTypes argument_types; - DataTypePtr return_type; - DataTypePtr json_return_type; -}; - -/// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing. -/// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL. 
-template typename Impl> -class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext -{ -public: - static constexpr auto name = Name::name; - - String getName() const override { return name; } - - static FunctionOverloadResolverPtr create(ContextPtr context_) - { - return std::make_unique(context_); - } - - explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {} - - bool isVariadic() const override { return true; } - size_t getNumberOfArguments() const override { return 0; } - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForLowCardinalityColumns() const override - { - return !functionForcesTheReturnType(); - } - - FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override - { - bool has_nothing_argument = false; - for (const auto & arg : arguments) - has_nothing_argument |= isNothing(arg.type); - - DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); - NullPresence null_presence = getNullPresense(arguments); - DataTypePtr return_type; - if (has_nothing_argument) - return_type = std::make_shared(); - else if (null_presence.has_null_constant) - return_type = makeNullable(std::make_shared()); - else if (null_presence.has_nullable) - return_type = makeNullable(json_return_type); - else - return_type = json_return_type; - - /// Top-level LowCardinality columns are processed outside JSON parser. 
- json_return_type = removeLowCardinality(json_return_type); - - DataTypes argument_types; - argument_types.reserve(arguments.size()); - for (const auto & argument : arguments) - argument_types.emplace_back(argument.type); - return std::make_unique>( - null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type); - } -}; - -struct NameJSONHas { static constexpr auto name{"JSONHas"}; }; -struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; }; -struct NameJSONLength { static constexpr auto name{"JSONLength"}; }; -struct NameJSONKey { static constexpr auto name{"JSONKey"}; }; -struct NameJSONType { static constexpr auto name{"JSONType"}; }; -struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; }; -struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; }; -struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; }; -struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; }; -struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; }; -struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; }; -struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; }; -struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; }; -struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; }; -struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; }; -struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; }; - - -template -class JSONHasImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared(); } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool 
insertResultToColumn(IColumn & dest, const Element &, std::string_view) - { - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(1); - return true; - } -}; - - -template -class IsValidJSONImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) - { - if (arguments.size() != 1) - { - /// IsValidJSON() shouldn't get parameters other than JSON. - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument", - String(function_name)); - } - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; } - - static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) - { - /// This function is called only if JSON is valid. - /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function. 
- ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(1); - return true; - } -}; - - -template -class JSONLengthImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - size_t size; - if (element.isArray()) - size = element.getArray().size(); - else if (element.isObject()) - size = element.getObject().size(); - else - return false; - - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(size); - return true; - } -}; - - -template -class JSONKeyImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key) - { - if (last_key.empty()) - return false; - ColumnString & col_str = assert_cast(dest); - col_str.insertData(last_key.data(), last_key.size()); - return true; - } -}; - - -template -class JSONTypeImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - static const std::vector> values = { - {"Array", '['}, - {"Object", '{'}, - {"String", '"'}, - {"Int64", 'i'}, - {"UInt64", 'u'}, - {"Double", 'd'}, - {"Bool", 'b'}, - {"Null", 0}, /// the default value for the column. 
- }; - return std::make_shared>(values); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - UInt8 type; - switch (element.type()) - { - case ElementType::INT64: - type = 'i'; - break; - case ElementType::UINT64: - type = 'u'; - break; - case ElementType::DOUBLE: - type = 'd'; - break; - case ElementType::STRING: - type = '"'; - break; - case ElementType::ARRAY: - type = '['; - break; - case ElementType::OBJECT: - type = '{'; - break; - case ElementType::BOOL: - type = 'b'; - break; - case ElementType::NULL_VALUE: - type = 0; - break; - } - - ColumnVector & col_vec = assert_cast &>(dest); - col_vec.insertValue(type); - return true; - } -}; - - -template -class JSONExtractNumericImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared>(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - NumberType value; - - switch (element.type()) - { - case ElementType::DOUBLE: - if constexpr (std::is_floating_point_v) - { - /// We permit inaccurate conversion of double to float. - /// Example: double 0.1 from JSON is not representable in float. - /// But it will be more convenient for user to perform conversion. 
- value = static_cast(element.getDouble()); - } - else if (!accurate::convertNumeric(element.getDouble(), value)) - return false; - break; - case ElementType::UINT64: - if (!accurate::convertNumeric(element.getUInt64(), value)) - return false; - break; - case ElementType::INT64: - if (!accurate::convertNumeric(element.getInt64(), value)) - return false; - break; - case ElementType::BOOL: - if constexpr (is_integer && convert_bool_to_integer) - { - value = static_cast(element.getBool()); - break; - } - return false; - case ElementType::STRING: - { - auto rb = ReadBufferFromMemory{element.getString()}; - if constexpr (std::is_floating_point_v) - { - if (!tryReadFloatText(value, rb) || !rb.eof()) - return false; - } - else - { - if (tryReadIntText(value, rb) && rb.eof()) - break; - - /// Try to parse float and convert it to integer. - Float64 tmp_float; - rb.position() = rb.buffer().begin(); - if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) - return false; - - if (!accurate::convertNumeric(tmp_float, value)) - return false; - } - break; - } - default: - return false; - } - - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(reinterpret_cast(&value), sizeof(value)); - } - else - { - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(value); - } - return true; - } -}; - - -template -using JSONExtractInt64Impl = JSONExtractNumericImpl; -template -using JSONExtractUInt64Impl = JSONExtractNumericImpl; -template -using JSONExtractFloat64Impl = JSONExtractNumericImpl; - - -template -class JSONExtractBoolImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, 
std::string_view) - { - bool value; - switch (element.type()) - { - case ElementType::BOOL: - value = element.getBool(); - break; - case ElementType::INT64: - value = element.getInt64() != 0; - break; - case ElementType::UINT64: - value = element.getUInt64() != 0; - break; - default: - return false; - } - - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(static_cast(value)); - return true; - } -}; - -template -class JSONExtractRawImpl; - -template -class JSONExtractStringImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (element.isNull()) - return false; - - if (!element.isString()) - return JSONExtractRawImpl::insertResultToColumn(dest, element, {}); - - auto str = element.getString(); - - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(str.data(), str.size()); - } - else - { - ColumnString & col_str = assert_cast(dest); - col_str.insertData(str.data(), str.size()); - } - return true; - } -}; - -/// Nodes of the extract tree. We need the extract tree to extract from JSON complex values containing array, tuples or nullables. 
-template -struct JSONExtractTree -{ - using Element = typename JSONParser::Element; - - class Node - { - public: - Node() = default; - virtual ~Node() = default; - virtual bool insertResultToColumn(IColumn &, const Element &) = 0; - }; - - template - class NumericNode : public Node - { - public: - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - return JSONExtractNumericImpl::insertResultToColumn(dest, element, {}); - } - }; - - class LowCardinalityFixedStringNode : public Node - { - public: - explicit LowCardinalityFixedStringNode(const size_t fixed_length_) : fixed_length(fixed_length_) { } - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - // If element is an object we delegate the insertion to JSONExtractRawImpl - if (element.isObject()) - return JSONExtractRawImpl::insertResultToLowCardinalityFixedStringColumn(dest, element, fixed_length); - else if (!element.isString()) - return false; - - auto str = element.getString(); - if (str.size() > fixed_length) - return false; - - // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation. 
- // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution) - // the data is padded here and written directly to the Low Cardinality Column - if (str.size() == fixed_length) - { - assert_cast(dest).insertData(str.data(), str.size()); - } - else - { - String padded_str(str); - padded_str.resize(fixed_length, '\0'); - - assert_cast(dest).insertData(padded_str.data(), padded_str.size()); - } - return true; - } - - private: - const size_t fixed_length; - }; - - class UUIDNode : public Node - { - public: - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - if (!element.isString()) - return false; - - auto uuid = parseFromString(element.getString()); - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(reinterpret_cast(&uuid), sizeof(uuid)); - } - else - { - assert_cast(dest).insert(uuid); - } - return true; - } - }; - - template - class DecimalNode : public Node - { - public: - explicit DecimalNode(DataTypePtr data_type_) : data_type(data_type_) {} - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - const auto * type = assert_cast *>(data_type.get()); - - DecimalType value{}; - - switch (element.type()) - { - case ElementType::DOUBLE: - value = convertToDecimal, DataTypeDecimal>( - element.getDouble(), type->getScale()); - break; - case ElementType::UINT64: - value = convertToDecimal, DataTypeDecimal>( - element.getUInt64(), type->getScale()); - break; - case ElementType::INT64: - value = convertToDecimal, DataTypeDecimal>( - element.getInt64(), type->getScale()); - break; - case ElementType::STRING: { - auto rb = ReadBufferFromMemory{element.getString()}; - if (!SerializationDecimal::tryReadText(value, rb, DecimalUtils::max_precision, type->getScale())) - return false; - break; - } - default: - return false; - } - - assert_cast &>(dest).insertValue(value); - 
return true; - } - - private: - DataTypePtr data_type; - }; - - class StringNode : public Node - { - public: - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - return JSONExtractStringImpl::insertResultToColumn(dest, element, {}); - } - }; - - class FixedStringNode : public Node - { - public: - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - if (element.isNull()) - return false; - - if (!element.isString()) - return JSONExtractRawImpl::insertResultToFixedStringColumn(dest, element, {}); - - auto str = element.getString(); - auto & col_str = assert_cast(dest); - if (str.size() > col_str.getN()) - return false; - col_str.insertData(str.data(), str.size()); - - return true; - } - }; - - template - class EnumNode : public Node - { - public: - explicit EnumNode(const std::vector> & name_value_pairs_) : name_value_pairs(name_value_pairs_) - { - for (const auto & name_value_pair : name_value_pairs) - { - name_to_value_map.emplace(name_value_pair.first, name_value_pair.second); - only_values.emplace(name_value_pair.second); - } - } - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - auto & col_vec = assert_cast &>(dest); - - if (element.isInt64()) - { - Type value; - if (!accurate::convertNumeric(element.getInt64(), value) || !only_values.contains(value)) - return false; - col_vec.insertValue(value); - return true; - } - - if (element.isUInt64()) - { - Type value; - if (!accurate::convertNumeric(element.getUInt64(), value) || !only_values.contains(value)) - return false; - col_vec.insertValue(value); - return true; - } - - if (element.isString()) - { - auto value = name_to_value_map.find(element.getString()); - if (value == name_to_value_map.end()) - return false; - col_vec.insertValue(value->second); - return true; - } - - return false; - } - - private: - std::vector> name_value_pairs; - std::unordered_map name_to_value_map; - std::unordered_set only_values; - }; - - class 
NullableNode : public Node - { - public: - explicit NullableNode(std::unique_ptr nested_) : nested(std::move(nested_)) {} - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - if (dest.getDataType() == TypeIndex::LowCardinality) - { - /// We do not need to handle nullability in that case - /// because nested node handles LowCardinality columns and will call proper overload of `insertData` - return nested->insertResultToColumn(dest, element); - } - - ColumnNullable & col_null = assert_cast(dest); - if (!nested->insertResultToColumn(col_null.getNestedColumn(), element)) - return false; - col_null.getNullMapColumn().insertValue(0); - return true; - } - - private: - std::unique_ptr nested; - }; - - class ArrayNode : public Node - { - public: - explicit ArrayNode(std::unique_ptr nested_) : nested(std::move(nested_)) {} - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - if (!element.isArray()) - return false; - - auto array = element.getArray(); - - ColumnArray & col_arr = assert_cast(dest); - auto & data = col_arr.getData(); - size_t old_size = data.size(); - bool were_valid_elements = false; - - for (auto value : array) - { - if (nested->insertResultToColumn(data, value)) - were_valid_elements = true; - else - data.insertDefault(); - } - - if (!were_valid_elements) - { - data.popBack(data.size() - old_size); - return false; - } - - col_arr.getOffsets().push_back(data.size()); - return true; - } - - private: - std::unique_ptr nested; - }; - - class TupleNode : public Node - { - public: - TupleNode(std::vector> nested_, const std::vector & explicit_names_) : nested(std::move(nested_)), explicit_names(explicit_names_) - { - for (size_t i = 0; i != explicit_names.size(); ++i) - name_to_index_map.emplace(explicit_names[i], i); - } - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - ColumnTuple & tuple = assert_cast(dest); - size_t old_size = dest.size(); - bool 
were_valid_elements = false; - - auto set_size = [&](size_t size) - { - for (size_t i = 0; i != tuple.tupleSize(); ++i) - { - auto & col = tuple.getColumn(i); - if (col.size() != size) - { - if (col.size() > size) - col.popBack(col.size() - size); - else - while (col.size() < size) - col.insertDefault(); - } - } - }; - - if (element.isArray()) - { - auto array = element.getArray(); - auto it = array.begin(); - - for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index) - { - if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it++)) - were_valid_elements = true; - else - tuple.getColumn(index).insertDefault(); - } - - set_size(old_size + static_cast(were_valid_elements)); - return were_valid_elements; - } - - if (element.isObject()) - { - auto object = element.getObject(); - if (name_to_index_map.empty()) - { - auto it = object.begin(); - for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index) - { - if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it++).second)) - were_valid_elements = true; - else - tuple.getColumn(index).insertDefault(); - } - } - else - { - for (const auto & [key, value] : object) - { - auto index = name_to_index_map.find(key); - if (index != name_to_index_map.end()) - { - if (nested[index->second]->insertResultToColumn(tuple.getColumn(index->second), value)) - were_valid_elements = true; - } - } - } - - set_size(old_size + static_cast(were_valid_elements)); - return were_valid_elements; - } - - return false; - } - - private: - std::vector> nested; - std::vector explicit_names; - std::unordered_map name_to_index_map; - }; - - class MapNode : public Node - { - public: - MapNode(std::unique_ptr key_, std::unique_ptr value_) : key(std::move(key_)), value(std::move(value_)) { } - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - if (!element.isObject()) - return false; - - ColumnMap & map_col = assert_cast(dest); - auto & offsets = 
map_col.getNestedColumn().getOffsets(); - auto & tuple_col = map_col.getNestedData(); - auto & key_col = tuple_col.getColumn(0); - auto & value_col = tuple_col.getColumn(1); - size_t old_size = tuple_col.size(); - - auto object = element.getObject(); - auto it = object.begin(); - for (; it != object.end(); ++it) - { - auto pair = *it; - - /// Insert key - key_col.insertData(pair.first.data(), pair.first.size()); - - /// Insert value - if (!value->insertResultToColumn(value_col, pair.second)) - value_col.insertDefault(); - } - - offsets.push_back(old_size + object.size()); - return true; - } - - private: - std::unique_ptr key; - std::unique_ptr value; - }; - - class VariantNode : public Node - { - public: - VariantNode(std::vector> variant_nodes_, std::vector order_) : variant_nodes(std::move(variant_nodes_)), order(std::move(order_)) { } - - bool insertResultToColumn(IColumn & dest, const Element & element) override - { - auto & column_variant = assert_cast(dest); - for (size_t i : order) - { - auto & variant = column_variant.getVariantByGlobalDiscriminator(i); - if (variant_nodes[i]->insertResultToColumn(variant, element)) - { - column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(i)); - column_variant.getOffsets().push_back(variant.size() - 1); - return true; - } - } - - return false; - } - - private: - std::vector> variant_nodes; - /// Order in which we should try variants nodes. - /// For example, String should be always the last one. 
- std::vector order; - }; - - static std::unique_ptr build(const char * function_name, const DataTypePtr & type) - { - switch (type->getTypeId()) - { - case TypeIndex::UInt8: return std::make_unique>(); - case TypeIndex::UInt16: return std::make_unique>(); - case TypeIndex::UInt32: return std::make_unique>(); - case TypeIndex::UInt64: return std::make_unique>(); - case TypeIndex::UInt128: return std::make_unique>(); - case TypeIndex::UInt256: return std::make_unique>(); - case TypeIndex::Int8: return std::make_unique>(); - case TypeIndex::Int16: return std::make_unique>(); - case TypeIndex::Int32: return std::make_unique>(); - case TypeIndex::Int64: return std::make_unique>(); - case TypeIndex::Int128: return std::make_unique>(); - case TypeIndex::Int256: return std::make_unique>(); - case TypeIndex::Float32: return std::make_unique>(); - case TypeIndex::Float64: return std::make_unique>(); - case TypeIndex::String: return std::make_unique(); - case TypeIndex::FixedString: return std::make_unique(); - case TypeIndex::UUID: return std::make_unique(); - case TypeIndex::LowCardinality: - { - // The low cardinality case is treated in two different ways: - // For FixedString type, an especial class is implemented for inserting the data in the destination column, - // as the string length must be passed in order to check and pad the incoming data. - // For the rest of low cardinality types, the insertion is done in their corresponding class, adapting the data - // as needed for the insertData function of the ColumnLowCardinality. 
- auto dictionary_type = typeid_cast(type.get())->getDictionaryType(); - if ((*dictionary_type).getTypeId() == TypeIndex::FixedString) - { - auto fixed_length = typeid_cast(dictionary_type.get())->getN(); - return std::make_unique(fixed_length); - } - return build(function_name, dictionary_type); - } - case TypeIndex::Decimal256: return std::make_unique>(type); - case TypeIndex::Decimal128: return std::make_unique>(type); - case TypeIndex::Decimal64: return std::make_unique>(type); - case TypeIndex::Decimal32: return std::make_unique>(type); - case TypeIndex::Enum8: - return std::make_unique>(static_cast(*type).getValues()); - case TypeIndex::Enum16: - return std::make_unique>(static_cast(*type).getValues()); - case TypeIndex::Nullable: - { - return std::make_unique(build(function_name, static_cast(*type).getNestedType())); - } - case TypeIndex::Array: - { - return std::make_unique(build(function_name, static_cast(*type).getNestedType())); - } - case TypeIndex::Tuple: - { - const auto & tuple = static_cast(*type); - const auto & tuple_elements = tuple.getElements(); - std::vector> elements; - elements.reserve(tuple_elements.size()); - for (const auto & tuple_element : tuple_elements) - elements.emplace_back(build(function_name, tuple_element)); - return std::make_unique(std::move(elements), tuple.haveExplicitNames() ? 
tuple.getElementNames() : Strings{}); - } - case TypeIndex::Map: - { - const auto & map_type = static_cast(*type); - const auto & key_type = map_type.getKeyType(); - if (!isString(removeLowCardinality(key_type))) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Function {} doesn't support the return type schema: {} with key type not String", - String(function_name), - type->getName()); - - const auto & value_type = map_type.getValueType(); - return std::make_unique(build(function_name, key_type), build(function_name, value_type)); - } - case TypeIndex::Variant: - { - const auto & variant_type = static_cast(*type); - const auto & variants = variant_type.getVariants(); - std::vector> variant_nodes; - variant_nodes.reserve(variants.size()); - for (const auto & variant : variants) - variant_nodes.push_back(build(function_name, variant)); - return std::make_unique(std::move(variant_nodes), SerializationVariant::getVariantsDeserializeTextOrder(variants)); - } - default: - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Function {} doesn't support the return type schema: {}", - String(function_name), type->getName()); - } - } -}; - - -template -class JSONExtractImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) - { - if (arguments.size() < 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); - - const auto & col = arguments.back(); - const auto * col_type_const = typeid_cast(col.column.get()); - if (!col_type_const || !isString(col.type)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "The last argument of function {} should " - "be a constant string specifying the return data type, illegal value: {}", - String(function_name), col.name); - - return DataTypeFactory::instance().get(col_type_const->getValue()); - } - - static size_t 
getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } - - void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) - { - extract_tree = JSONExtractTree::build(function_name, result_type); - } - - bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - return extract_tree->insertResultToColumn(dest, element); - } - -protected: - std::unique_ptr::Node> extract_tree; -}; - - -template -class JSONExtractKeysAndValuesImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) - { - if (arguments.size() < 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); - - const auto & col = arguments.back(); - const auto * col_type_const = typeid_cast(col.column.get()); - if (!col_type_const || !isString(col.type)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "The last argument of function {} should " - "be a constant string specifying the values' data type, illegal value: {}", - String(function_name), col.name); - - DataTypePtr key_type = std::make_unique(); - DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue()); - DataTypePtr tuple_type = std::make_unique(DataTypes{key_type, value_type}); - return std::make_unique(tuple_type); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } - - void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) - { - const auto tuple_type = typeid_cast(result_type.get())->getNestedType(); - const auto value_type = typeid_cast(tuple_type.get())->getElements()[1]; - extract_tree = JSONExtractTree::build(function_name, value_type); - } - - bool insertResultToColumn(IColumn & dest, 
const Element & element, std::string_view) - { - if (!element.isObject()) - return false; - - auto object = element.getObject(); - - auto & col_arr = assert_cast(dest); - auto & col_tuple = assert_cast(col_arr.getData()); - size_t old_size = col_tuple.size(); - auto & col_key = assert_cast(col_tuple.getColumn(0)); - auto & col_value = col_tuple.getColumn(1); - - for (const auto & [key, value] : object) - { - if (extract_tree->insertResultToColumn(col_value, value)) - col_key.insertData(key.data(), key.size()); - } - - if (col_tuple.size() == old_size) - return false; - - col_arr.getOffsets().push_back(col_tuple.size()); - return true; - } - -private: - std::unique_ptr::Node> extract_tree; -}; - - -template -class JSONExtractRawImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnString::Chars chars; - WriteBufferFromVector buf(chars, AppendModeTag()); - traverse(element, buf); - buf.finalize(); - assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); - } - else - { - ColumnString & col_str = assert_cast(dest); - auto & chars = col_str.getChars(); - WriteBufferFromVector buf(chars, AppendModeTag()); - traverse(element, buf); - buf.finalize(); - chars.push_back(0); - col_str.getOffsets().push_back(chars.size()); - } - return true; - } - - // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column - static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view) - { - ColumnFixedString::Chars chars; - WriteBufferFromVector buf(chars, AppendModeTag()); - 
traverse(element, buf); - buf.finalize(); - - auto & col_str = assert_cast(dest); - - if (chars.size() > col_str.getN()) - return false; - - chars.resize_fill(col_str.getN()); - col_str.insertData(reinterpret_cast(chars.data()), chars.size()); - - - return true; - } - - // We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column - static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length) - { - if (element.getObject().size() > fixed_length) - return false; - - ColumnFixedString::Chars chars; - WriteBufferFromVector buf(chars, AppendModeTag()); - traverse(element, buf); - buf.finalize(); - - if (chars.size() > fixed_length) - return false; - chars.resize_fill(fixed_length); - assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); - - return true; - } - -private: - static void traverse(const Element & element, WriteBuffer & buf) - { - if (element.isInt64()) - { - writeIntText(element.getInt64(), buf); - return; - } - if (element.isUInt64()) - { - writeIntText(element.getUInt64(), buf); - return; - } - if (element.isDouble()) - { - writeFloatText(element.getDouble(), buf); - return; - } - if (element.isBool()) - { - if (element.getBool()) - writeCString("true", buf); - else - writeCString("false", buf); - return; - } - if (element.isString()) - { - writeJSONString(element.getString(), buf, formatSettings()); - return; - } - if (element.isArray()) - { - writeChar('[', buf); - bool need_comma = false; - for (auto value : element.getArray()) - { - if (std::exchange(need_comma, true)) - writeChar(',', buf); - traverse(value, buf); - } - writeChar(']', buf); - return; - } - if (element.isObject()) - { - writeChar('{', buf); - bool need_comma = false; - for (auto [key, value] : element.getObject()) - { - if (std::exchange(need_comma, true)) - writeChar(',', buf); - writeJSONString(key, buf, formatSettings()); - 
writeChar(':', buf); - traverse(value, buf); - } - writeChar('}', buf); - return; - } - if (element.isNull()) - { - writeCString("null", buf); - return; - } - } - - static const FormatSettings & formatSettings() - { - static const FormatSettings the_instance = [] - { - FormatSettings settings; - settings.json.escape_forward_slashes = false; - return settings; - }(); - return the_instance; - } -}; - - -template -class JSONExtractArrayRawImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_shared(std::make_shared()); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (!element.isArray()) - return false; - - auto array = element.getArray(); - ColumnArray & col_res = assert_cast(dest); - - for (auto value : array) - JSONExtractRawImpl::insertResultToColumn(col_res.getData(), value, {}); - - col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size()); - return true; - } -}; - - -template -class JSONExtractKeysAndValuesRawImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - DataTypePtr string_type = std::make_unique(); - DataTypePtr tuple_type = std::make_unique(DataTypes{string_type, string_type}); - return std::make_unique(tuple_type); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (!element.isObject()) - return false; - - auto object = element.getObject(); - - auto & col_arr = assert_cast(dest); - auto & col_tuple = assert_cast(col_arr.getData()); - auto & col_key = 
assert_cast(col_tuple.getColumn(0)); - auto & col_value = assert_cast(col_tuple.getColumn(1)); - - for (const auto & [key, value] : object) - { - col_key.insertData(key.data(), key.size()); - JSONExtractRawImpl::insertResultToColumn(col_value, value, {}); - } - - col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size()); - return true; - } -}; - -template -class JSONExtractKeysImpl -{ -public: - using Element = typename JSONParser::Element; - - static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) - { - return std::make_unique(std::make_shared()); - } - - static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - - bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) - { - if (!element.isObject()) - return false; - - auto object = element.getObject(); - - ColumnArray & col_res = assert_cast(dest); - auto & col_key = assert_cast(col_res.getData()); - - for (const auto & [key, value] : object) - { - col_key.insertData(key.data(), key.size()); - } - - col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size()); - return true; - } -}; - -} +//#pragma once +// +//#include +//#include +// +//#include +// +//#include +// +//#include +//#include +//#include +// +//#include +////#include +// +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include +//#include +//#include +//#include +//#include +// +//#include +//#include +// +// +//#include "config.h" +// +// +//namespace DB +//{ +// +//namespace ErrorCodes +//{ +// extern const int ILLEGAL_TYPE_OF_ARGUMENT; +// extern const int ILLEGAL_COLUMN; +// extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +//} +// 
+//template +//concept HasIndexOperator = requires (T t) +//{ +// t[0]; +//}; +// +///// Functions to parse JSONs and extract values from it. +///// The first argument of all these functions gets a JSON, +///// after that there are any number of arguments specifying path to a desired part from the JSON's root. +///// For example, +///// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 +// +//class FunctionJSONHelpers +//{ +//public: +// template typename Impl, class JSONParser> +// class Executor +// { +// public: +// static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) +// { +// MutableColumnPtr to{result_type->createColumn()}; +// to->reserve(input_rows_count); +// +// if (arguments.empty()) +// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name)); +// +// const auto & first_column = arguments[0]; +// if (!isString(first_column.type)) +// throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, +// "The first argument of function {} should be a string containing JSON, illegal type: " +// "{}", String(Name::name), first_column.type->getName()); +// +// const ColumnPtr & arg_json = first_column.column; +// const auto * col_json_const = typeid_cast(arg_json.get()); +// const auto * col_json_string +// = typeid_cast(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get()); +// +// if (!col_json_string) +// throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName()); +// +// const ColumnString::Chars & chars = col_json_string->getChars(); +// const ColumnString::Offsets & offsets = col_json_string->getOffsets(); +// +// size_t num_index_arguments = Impl::getNumberOfIndexArguments(arguments); +// std::vector moves = prepareMoves(Name::name, arguments, 1, num_index_arguments); +// +// /// Preallocate memory in parser if necessary. 
+// JSONParser parser; +// if constexpr (has_member_function_reserve::value) +// { +// size_t max_size = calculateMaxSize(offsets); +// if (max_size) +// parser.reserve(max_size); +// } +// +// Impl impl; +// +// /// prepare() does Impl-specific preparation before handling each row. +// if constexpr (has_member_function_prepare::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value) +// impl.prepare(Name::name, arguments, result_type); +// +// using Element = typename JSONParser::Element; +// +// Element document; +// bool document_ok = false; +// if (col_json_const) +// { +// std::string_view json{reinterpret_cast(chars.data()), offsets[0] - 1}; +// document_ok = parser.parse(json, document); +// } +// +// for (const auto i : collections::range(0, input_rows_count)) +// { +// if (!col_json_const) +// { +// std::string_view json{reinterpret_cast(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1}; +// document_ok = parser.parse(json, document); +// } +// +// bool added_to_column = false; +// if (document_ok) +// { +// /// Perform moves. +// Element element; +// std::string_view last_key; +// bool moves_ok = performMoves(arguments, i, document, moves, element, last_key); +// +// if (moves_ok) +// added_to_column = impl.insertResultToColumn(*to, element, last_key); +// } +// +// /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions. +// if (!added_to_column) +// to->insertDefault(); +// } +// return to; +// } +// }; +// +//private: +// BOOST_TTI_HAS_MEMBER_FUNCTION(reserve) +// BOOST_TTI_HAS_MEMBER_FUNCTION(prepare) +// +// /// Represents a move of a JSON iterator described by a single argument passed to a JSON function. +// /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) +// /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}. 
+// /// Keys and indices can be nonconst, in this case they are calculated for each row. +// enum class MoveType : uint8_t +// { +// Key, +// Index, +// ConstKey, +// ConstIndex, +// }; +// +// struct Move +// { +// explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {} +// Move(MoveType type_, const String & key_) : type(type_), key(key_) {} +// MoveType type; +// size_t index = 0; +// String key; +// }; +// +// static std::vector prepareMoves( +// const char * function_name, +// const ColumnsWithTypeAndName & columns, +// size_t first_index_argument, +// size_t num_index_arguments) +// { +// std::vector moves; +// moves.reserve(num_index_arguments); +// for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments)) +// { +// const auto & column = columns[i]; +// if (!isString(column.type) && !isNativeInteger(column.type)) +// throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, +// "The argument {} of function {} should be a string specifying key " +// "or an integer specifying index, illegal type: {}", +// std::to_string(i + 1), String(function_name), column.type->getName()); +// +// if (column.column && isColumnConst(*column.column)) +// { +// const auto & column_const = assert_cast(*column.column); +// if (isString(column.type)) +// moves.emplace_back(MoveType::ConstKey, column_const.getValue()); +// else +// moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0)); +// } +// else +// { +// if (isString(column.type)) +// moves.emplace_back(MoveType::Key, ""); +// else +// moves.emplace_back(MoveType::Index, 0); +// } +// } +// return moves; +// } +// +// +// /// Performs moves of types MoveType::Index and MoveType::ConstIndex. 
+// template +// static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row, +// const typename JSONParser::Element & document, const std::vector & moves, +// typename JSONParser::Element & element, std::string_view & last_key) +// { +// typename JSONParser::Element res_element = document; +// std::string_view key; +// +// for (size_t j = 0; j != moves.size(); ++j) +// { +// switch (moves[j].type) +// { +// case MoveType::ConstIndex: +// { +// if (!moveToElementByIndex(res_element, static_cast(moves[j].index), key)) +// return false; +// break; +// } +// case MoveType::ConstKey: +// { +// key = moves[j].key; +// if (!moveToElementByKey(res_element, key)) +// return false; +// break; +// } +// case MoveType::Index: +// { +// Int64 index = (*arguments[j + 1].column)[row].get(); +// if (!moveToElementByIndex(res_element, static_cast(index), key)) +// return false; +// break; +// } +// case MoveType::Key: +// { +// key = arguments[j + 1].column->getDataAt(row).toView(); +// if (!moveToElementByKey(res_element, key)) +// return false; +// break; +// } +// } +// } +// +// element = res_element; +// last_key = key; +// return true; +// } +// +// template +// static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key) +// { +// if (element.isArray()) +// { +// auto array = element.getArray(); +// if (index >= 0) +// --index; +// else +// index += array.size(); +// +// if (static_cast(index) >= array.size()) +// return false; +// element = array[index]; +// out_key = {}; +// return true; +// } +// +// if constexpr (HasIndexOperator) +// { +// if (element.isObject()) +// { +// auto object = element.getObject(); +// if (index >= 0) +// --index; +// else +// index += object.size(); +// +// if (static_cast(index) >= object.size()) +// return false; +// std::tie(out_key, element) = object[index]; +// return true; +// } +// } +// +// return {}; +// } +// +// /// Performs moves of types MoveType::Key and 
MoveType::ConstKey. +// template +// static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key) +// { +// if (!element.isObject()) +// return false; +// auto object = element.getObject(); +// return object.find(key, element); +// } +// +// static size_t calculateMaxSize(const ColumnString::Offsets & offsets) +// { +// size_t max_size = 0; +// for (const auto i : collections::range(0, offsets.size())) +// { +// size_t size = offsets[i] - offsets[i - 1]; +// max_size = std::max(max_size, size); +// } +// if (max_size) +// --max_size; +// return max_size; +// } +// +//}; +// +//template +//class JSONExtractImpl; +// +//template +//class JSONExtractKeysAndValuesImpl; +// +///** +//* Functions JSONExtract and JSONExtractKeysAndValues force the return type - it is specified in the last argument. +//* For example - `SELECT JSONExtract(materialize('{"a": 131231, "b": 1234}'), 'b', 'LowCardinality(FixedString(4))')` +//* But by default ClickHouse decides on its own whether the return type will be LowCardinality based on the types of +//* input arguments. +//* And for these specific functions we cannot rely on this mechanism, so these functions have their own implementation - +//* just convert all of the LowCardinality input columns to full ones, execute and wrap the resulting column in LowCardinality +//* if needed. 
+//*/ +//template typename Impl> +//constexpr bool functionForcesTheReturnType() +//{ +// return std::is_same_v, JSONExtractImpl> || std::is_same_v, JSONExtractKeysAndValuesImpl>; +//} +// +//template typename Impl> +//class ExecutableFunctionJSON : public IExecutableFunction +//{ +// +//public: +// explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_) +// : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_) +// { +// } +// +// String getName() const override { return Name::name; } +// bool useDefaultImplementationForNulls() const override { return false; } +// bool useDefaultImplementationForConstants() const override { return true; } +// bool useDefaultImplementationForLowCardinalityColumns() const override +// { +// return !functionForcesTheReturnType(); +// } +// +// ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override +// { +// if (null_presence.has_null_constant) +// return result_type->createColumnConstWithDefaultValue(input_rows_count); +// +// if constexpr (functionForcesTheReturnType()) +// { +// ColumnsWithTypeAndName columns_without_low_cardinality = arguments; +// +// for (auto & column : columns_without_low_cardinality) +// { +// column.column = recursiveRemoveLowCardinality(column.column); +// column.type = recursiveRemoveLowCardinality(column.type); +// } +// +// ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? 
createBlockWithNestedColumns(columns_without_low_cardinality) : columns_without_low_cardinality; +// ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); +// +// if (null_presence.has_nullable) +// temporary_result = wrapInNullable(temporary_result, columns_without_low_cardinality, result_type, input_rows_count); +// +// if (result_type->lowCardinality()) +// temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); +// +// return temporary_result; +// } +// else +// { +// ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments; +// ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); +// +// if (null_presence.has_nullable) +// temporary_result = wrapInNullable(temporary_result, arguments, result_type, input_rows_count); +// +// if (result_type->lowCardinality()) +// temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); +// +// return temporary_result; +// } +// } +// +//private: +// +// ColumnPtr +// chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const +// { +//#if USE_SIMDJSON +// if (allow_simdjson) +// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); +//#endif +// +//#if USE_RAPIDJSON +// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); +//#else +// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); +//#endif +// } +// +// NullPresence null_presence; +// bool allow_simdjson; +// DataTypePtr json_return_type; +//}; +// +// +//template typename Impl> +//class FunctionBaseFunctionJSON : public IFunctionBase +//{ +//public: +// explicit FunctionBaseFunctionJSON( +// const NullPresence & null_presence_, +// bool 
allow_simdjson_, +// DataTypes argument_types_, +// DataTypePtr return_type_, +// DataTypePtr json_return_type_) +// : null_presence(null_presence_) +// , allow_simdjson(allow_simdjson_) +// , argument_types(std::move(argument_types_)) +// , return_type(std::move(return_type_)) +// , json_return_type(std::move(json_return_type_)) +// { +// } +// +// String getName() const override { return Name::name; } +// +// const DataTypes & getArgumentTypes() const override +// { +// return argument_types; +// } +// +// const DataTypePtr & getResultType() const override +// { +// return return_type; +// } +// +// bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } +// +// ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override +// { +// return std::make_unique>(null_presence, allow_simdjson, json_return_type); +// } +// +//private: +// NullPresence null_presence; +// bool allow_simdjson; +// DataTypes argument_types; +// DataTypePtr return_type; +// DataTypePtr json_return_type; +//}; +// +///// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing. +///// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL. 
+//template typename Impl> +//class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext +//{ +//public: +// static constexpr auto name = Name::name; +// +// String getName() const override { return name; } +// +// static FunctionOverloadResolverPtr create(ContextPtr context_) +// { +// return std::make_unique(context_); +// } +// +// explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {} +// +// bool isVariadic() const override { return true; } +// size_t getNumberOfArguments() const override { return 0; } +// bool useDefaultImplementationForNulls() const override { return false; } +// bool useDefaultImplementationForLowCardinalityColumns() const override +// { +// return !functionForcesTheReturnType(); +// } +// +// FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override +// { +// bool has_nothing_argument = false; +// for (const auto & arg : arguments) +// has_nothing_argument |= isNothing(arg.type); +// +// DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); +// NullPresence null_presence = getNullPresense(arguments); +// DataTypePtr return_type; +// if (has_nothing_argument) +// return_type = std::make_shared(); +// else if (null_presence.has_null_constant) +// return_type = makeNullable(std::make_shared()); +// else if (null_presence.has_nullable) +// return_type = makeNullable(json_return_type); +// else +// return_type = json_return_type; +// +// /// Top-level LowCardinality columns are processed outside JSON parser. 
+// json_return_type = removeLowCardinality(json_return_type); +// +// DataTypes argument_types; +// argument_types.reserve(arguments.size()); +// for (const auto & argument : arguments) +// argument_types.emplace_back(argument.type); +// return std::make_unique>( +// null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type); +// } +//}; +// +//struct NameJSONHas { static constexpr auto name{"JSONHas"}; }; +//struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; }; +//struct NameJSONLength { static constexpr auto name{"JSONLength"}; }; +//struct NameJSONKey { static constexpr auto name{"JSONKey"}; }; +//struct NameJSONType { static constexpr auto name{"JSONType"}; }; +//struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; }; +//struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; }; +//struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; }; +//struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; }; +//struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; }; +//struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; }; +//struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; }; +//struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; }; +//struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; }; +//struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; }; +//struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; }; +// +// +//template +//class JSONHasImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared(); } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & 
arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) +// { +// ColumnVector & col_vec = assert_cast &>(dest); +// col_vec.insertValue(1); +// return true; +// } +//}; +// +// +//template +//class IsValidJSONImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) +// { +// if (arguments.size() != 1) +// { +// /// IsValidJSON() shouldn't get parameters other than JSON. +// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument", +// String(function_name)); +// } +// return std::make_shared(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; } +// +// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) +// { +// /// This function is called only if JSON is valid. +// /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function. 
+// ColumnVector & col_vec = assert_cast &>(dest); +// col_vec.insertValue(1); +// return true; +// } +//}; +// +// +//template +//class JSONLengthImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// size_t size; +// if (element.isArray()) +// size = element.getArray().size(); +// else if (element.isObject()) +// size = element.getObject().size(); +// else +// return false; +// +// ColumnVector & col_vec = assert_cast &>(dest); +// col_vec.insertValue(size); +// return true; +// } +//}; +// +// +//template +//class JSONKeyImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key) +// { +// if (last_key.empty()) +// return false; +// ColumnString & col_str = assert_cast(dest); +// col_str.insertData(last_key.data(), last_key.size()); +// return true; +// } +//}; +// +// +//template +//class JSONTypeImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// static const std::vector> values = { +// {"Array", '['}, +// {"Object", '{'}, +// {"String", '"'}, +// {"Int64", 'i'}, +// {"UInt64", 'u'}, +// {"Double", 'd'}, +// {"Bool", 'b'}, +// {"Null", 0}, /// the default value for the column. 
+// }; +// return std::make_shared>(values); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// UInt8 type; +// switch (element.type()) +// { +// case ElementType::INT64: +// type = 'i'; +// break; +// case ElementType::UINT64: +// type = 'u'; +// break; +// case ElementType::DOUBLE: +// type = 'd'; +// break; +// case ElementType::STRING: +// type = '"'; +// break; +// case ElementType::ARRAY: +// type = '['; +// break; +// case ElementType::OBJECT: +// type = '{'; +// break; +// case ElementType::BOOL: +// type = 'b'; +// break; +// case ElementType::NULL_VALUE: +// type = 0; +// break; +// } +// +// ColumnVector & col_vec = assert_cast &>(dest); +// col_vec.insertValue(type); +// return true; +// } +//}; +// +// +//template +//class JSONExtractNumericImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared>(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// NumberType value; +// +// switch (element.type()) +// { +// case ElementType::DOUBLE: +// if constexpr (std::is_floating_point_v) +// { +// /// We permit inaccurate conversion of double to float. +// /// Example: double 0.1 from JSON is not representable in float. +// /// But it will be more convenient for user to perform conversion. 
+// value = static_cast(element.getDouble()); +// } +// else if (!accurate::convertNumeric(element.getDouble(), value)) +// return false; +// break; +// case ElementType::UINT64: +// if (!accurate::convertNumeric(element.getUInt64(), value)) +// return false; +// break; +// case ElementType::INT64: +// if (!accurate::convertNumeric(element.getInt64(), value)) +// return false; +// break; +// case ElementType::BOOL: +// if constexpr (is_integer && convert_bool_to_integer) +// { +// value = static_cast(element.getBool()); +// break; +// } +// return false; +// case ElementType::STRING: +// { +// auto rb = ReadBufferFromMemory{element.getString()}; +// if constexpr (std::is_floating_point_v) +// { +// if (!tryReadFloatText(value, rb) || !rb.eof()) +// return false; +// } +// else +// { +// if (tryReadIntText(value, rb) && rb.eof()) +// break; +// +// /// Try to parse float and convert it to integer. +// Float64 tmp_float; +// rb.position() = rb.buffer().begin(); +// if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) +// return false; +// +// if (!accurate::convertNumeric(tmp_float, value)) +// return false; +// } +// break; +// } +// default: +// return false; +// } +// +// if (dest.getDataType() == TypeIndex::LowCardinality) +// { +// ColumnLowCardinality & col_low = assert_cast(dest); +// col_low.insertData(reinterpret_cast(&value), sizeof(value)); +// } +// else +// { +// auto & col_vec = assert_cast &>(dest); +// col_vec.insertValue(value); +// } +// return true; +// } +//}; +// +// +//template +//using JSONExtractInt64Impl = JSONExtractNumericImpl; +//template +//using JSONExtractUInt64Impl = JSONExtractNumericImpl; +//template +//using JSONExtractFloat64Impl = JSONExtractNumericImpl; +// +// +//template +//class JSONExtractBoolImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(); +// } +// +// static size_t 
getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// bool value; +// switch (element.type()) +// { +// case ElementType::BOOL: +// value = element.getBool(); +// break; +// case ElementType::INT64: +// value = element.getInt64() != 0; +// break; +// case ElementType::UINT64: +// value = element.getUInt64() != 0; +// break; +// default: +// return false; +// } +// +// auto & col_vec = assert_cast &>(dest); +// col_vec.insertValue(static_cast(value)); +// return true; +// } +//}; +// +//template +//class JSONExtractRawImpl; +// +//template +//class JSONExtractStringImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (element.isNull()) +// return false; +// +// if (!element.isString()) +// return JSONExtractRawImpl::insertResultToColumn(dest, element, {}); +// +// auto str = element.getString(); +// +// if (dest.getDataType() == TypeIndex::LowCardinality) +// { +// ColumnLowCardinality & col_low = assert_cast(dest); +// col_low.insertData(str.data(), str.size()); +// } +// else +// { +// ColumnString & col_str = assert_cast(dest); +// col_str.insertData(str.data(), str.size()); +// } +// return true; +// } +//}; +// +// +//static const JSONExtractInsertSettings & getJSONExtractInsertSettings() +//{ +// static const JSONExtractInsertSettings instance = [] +// { +// JSONExtractInsertSettings settings; +// settings.insert_null_as_default = false; +// settings.insert_default_on_invalid_elements_in_complex_types = true; +// return settings; 
+// }(); +// return instance; +//} +// +//template +//class JSONExtractImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) +// { +// if (arguments.size() < 2) +// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); +// +// const auto & col = arguments.back(); +// const auto * col_type_const = typeid_cast(col.column.get()); +// if (!col_type_const || !isString(col.type)) +// throw Exception(ErrorCodes::ILLEGAL_COLUMN, +// "The last argument of function {} should " +// "be a constant string specifying the return data type, illegal value: {}", +// String(function_name), col.name); +// +// return DataTypeFactory::instance().get(col_type_const->getValue()); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } +// +// void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) +// { +// extract_tree = buildJSONExtractTree(result_type, function_name); +// } +// +// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// String error; +// return extract_tree->insertResultToColumn(dest, element, getJSONExtractInsertSettings(), error); +// } +// +//protected: +// std::unique_ptr> extract_tree; +//}; +// +// +//template +//class JSONExtractKeysAndValuesImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) +// { +// if (arguments.size() < 2) +// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); +// +// const auto & col = arguments.back(); +// const auto * col_type_const = typeid_cast(col.column.get()); 
+// if (!col_type_const || !isString(col.type)) +// throw Exception(ErrorCodes::ILLEGAL_COLUMN, +// "The last argument of function {} should " +// "be a constant string specifying the values' data type, illegal value: {}", +// String(function_name), col.name); +// +// DataTypePtr key_type = std::make_unique(); +// DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue()); +// DataTypePtr tuple_type = std::make_unique(DataTypes{key_type, value_type}); +// return std::make_unique(tuple_type); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } +// +// void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) +// { +// const auto tuple_type = typeid_cast(result_type.get())->getNestedType(); +// const auto value_type = typeid_cast(tuple_type.get())->getElements()[1]; +// extract_tree = buildJSONExtractTree(value_type, function_name); +// } +// +// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (!element.isObject()) +// return false; +// +// auto object = element.getObject(); +// +// auto & col_arr = assert_cast(dest); +// auto & col_tuple = assert_cast(col_arr.getData()); +// size_t old_size = col_tuple.size(); +// auto & col_key = assert_cast(col_tuple.getColumn(0)); +// auto & col_value = col_tuple.getColumn(1); +// +// String error; +// for (const auto & [key, value] : object) +// { +// if (extract_tree->insertResultToColumn(col_value, value, getJSONExtractInsertSettings(), error)) +// col_key.insertData(key.data(), key.size()); +// } +// +// if (col_tuple.size() == old_size) +// return false; +// +// col_arr.getOffsets().push_back(col_tuple.size()); +// return true; +// } +// +//private: +// std::unique_ptr> extract_tree; +//}; +// +// +//template +//class JSONExtractRawImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr 
getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (dest.getDataType() == TypeIndex::LowCardinality) +// { +// ColumnString::Chars chars; +// WriteBufferFromVector buf(chars, AppendModeTag()); +// traverse(element, buf); +// buf.finalize(); +// assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); +// } +// else +// { +// ColumnString & col_str = assert_cast(dest); +// auto & chars = col_str.getChars(); +// WriteBufferFromVector buf(chars, AppendModeTag()); +// traverse(element, buf); +// buf.finalize(); +// chars.push_back(0); +// col_str.getOffsets().push_back(chars.size()); +// } +// return true; +// } +// +// // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column +// static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view) +// { +// ColumnFixedString::Chars chars; +// WriteBufferFromVector buf(chars, AppendModeTag()); +// traverse(element, buf); +// buf.finalize(); +// +// auto & col_str = assert_cast(dest); +// +// if (chars.size() > col_str.getN()) +// return false; +// +// chars.resize_fill(col_str.getN()); +// col_str.insertData(reinterpret_cast(chars.data()), chars.size()); +// +// +// return true; +// } +// +// // We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column +// static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length) +// { +// if (element.getObject().size() > fixed_length) +// return false; +// +// ColumnFixedString::Chars chars; +// WriteBufferFromVector buf(chars, AppendModeTag()); +// traverse(element, buf); +// 
buf.finalize(); +// +// if (chars.size() > fixed_length) +// return false; +// chars.resize_fill(fixed_length); +// assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); +// +// return true; +// } +// +//private: +// static void traverse(const Element & element, WriteBuffer & buf) +// { +// if (element.isInt64()) +// { +// writeIntText(element.getInt64(), buf); +// return; +// } +// if (element.isUInt64()) +// { +// writeIntText(element.getUInt64(), buf); +// return; +// } +// if (element.isDouble()) +// { +// writeFloatText(element.getDouble(), buf); +// return; +// } +// if (element.isBool()) +// { +// if (element.getBool()) +// writeCString("true", buf); +// else +// writeCString("false", buf); +// return; +// } +// if (element.isString()) +// { +// writeJSONString(element.getString(), buf, formatSettings()); +// return; +// } +// if (element.isArray()) +// { +// writeChar('[', buf); +// bool need_comma = false; +// for (auto value : element.getArray()) +// { +// if (std::exchange(need_comma, true)) +// writeChar(',', buf); +// traverse(value, buf); +// } +// writeChar(']', buf); +// return; +// } +// if (element.isObject()) +// { +// writeChar('{', buf); +// bool need_comma = false; +// for (auto [key, value] : element.getObject()) +// { +// if (std::exchange(need_comma, true)) +// writeChar(',', buf); +// writeJSONString(key, buf, formatSettings()); +// writeChar(':', buf); +// traverse(value, buf); +// } +// writeChar('}', buf); +// return; +// } +// if (element.isNull()) +// { +// writeCString("null", buf); +// return; +// } +// } +// +// static const FormatSettings & formatSettings() +// { +// static const FormatSettings the_instance = [] +// { +// FormatSettings settings; +// settings.json.escape_forward_slashes = false; +// return settings; +// }(); +// return the_instance; +// } +//}; +// +// +//template +//class JSONExtractArrayRawImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr 
getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_shared(std::make_shared()); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (!element.isArray()) +// return false; +// +// auto array = element.getArray(); +// ColumnArray & col_res = assert_cast(dest); +// +// for (auto value : array) +// JSONExtractRawImpl::insertResultToColumn(col_res.getData(), value, {}); +// +// col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size()); +// return true; +// } +//}; +// +// +//template +//class JSONExtractKeysAndValuesRawImpl +//{ +//public: +// using Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// DataTypePtr string_type = std::make_unique(); +// DataTypePtr tuple_type = std::make_unique(DataTypes{string_type, string_type}); +// return std::make_unique(tuple_type); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (!element.isObject()) +// return false; +// +// auto object = element.getObject(); +// +// auto & col_arr = assert_cast(dest); +// auto & col_tuple = assert_cast(col_arr.getData()); +// auto & col_key = assert_cast(col_tuple.getColumn(0)); +// auto & col_value = assert_cast(col_tuple.getColumn(1)); +// +// for (const auto & [key, value] : object) +// { +// col_key.insertData(key.data(), key.size()); +// JSONExtractRawImpl::insertResultToColumn(col_value, value, {}); +// } +// +// col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size()); +// return true; +// } +//}; +// +//template +//class JSONExtractKeysImpl +//{ +//public: +// using 
Element = typename JSONParser::Element; +// +// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) +// { +// return std::make_unique(std::make_shared()); +// } +// +// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } +// +// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) +// { +// if (!element.isObject()) +// return false; +// +// auto object = element.getObject(); +// +// ColumnArray & col_res = assert_cast(dest); +// auto & col_key = assert_cast(col_res.getData()); +// +// for (const auto & [key, value] : object) +// { +// col_key.insertData(key.data(), key.size()); +// } +// +// col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size()); +// return true; +// } +//}; +// +//} diff --git a/tests/queries/0_stateless/03198_json_extract_more_types.reference b/tests/queries/0_stateless/03198_json_extract_more_types.reference new file mode 100644 index 00000000000..9a6580ff81b --- /dev/null +++ b/tests/queries/0_stateless/03198_json_extract_more_types.reference @@ -0,0 +1,21 @@ +2020-01-01 +2020-01-01 +2020-01-01 00:00:00 +2020-01-01 00:00:00.000000 +127.0.0.1 +2001:db8:85a3::8a2e:370:7334 +42 +42 +42 +42 +42 +42 +42 +42 +42 +42 +Hello +Hello +\0\0\0 +Hello\0\0\0\0\0 +5801c962-1182-458a-89f8-d077da5074f9 diff --git a/tests/queries/0_stateless/03198_json_extract_more_types.sql b/tests/queries/0_stateless/03198_json_extract_more_types.sql new file mode 100644 index 00000000000..28d24bbb271 --- /dev/null +++ b/tests/queries/0_stateless/03198_json_extract_more_types.sql @@ -0,0 +1,29 @@ +set allow_suspicious_low_cardinality_types=1; + +select JSONExtract('{"a" : "2020-01-01"}', 'a', 'Date'); +select JSONExtract('{"a" : "2020-01-01"}', 'a', 'Date32'); +select JSONExtract('{"a" : "2020-01-01 00:00:00"}', 'a', 'DateTime'); +select JSONExtract('{"a" : "2020-01-01 00:00:00.000000"}', 'a', 'DateTime64(6)'); +select JSONExtract('{"a" 
: "127.0.0.1"}', 'a', 'IPv4'); +select JSONExtract('{"a" : "2001:0db8:85a3:0000:0000:8a2e:0370:7334"}', 'a', 'IPv6'); + + +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt8)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int8)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt16)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int16)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt32)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int32)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt64)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int64)'); + +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Float32)'); +select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Float32)'); + +select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(String)'); +select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(5))'); +select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(3))'); +select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(10))'); + +select JSONExtract('{"a" : "5801c962-1182-458a-89f8-d077da5074f9"}', 'a', 'LowCardinality(UUID)'); + diff --git a/tests/queries/0_stateless/03199_json_extract_dynamic.reference b/tests/queries/0_stateless/03199_json_extract_dynamic.reference new file mode 100644 index 00000000000..759b7763cd1 --- /dev/null +++ b/tests/queries/0_stateless/03199_json_extract_dynamic.reference @@ -0,0 +1,30 @@ +true Bool +42 Int64 +-42 Int64 +18446744073709551615 UInt64 +42.42 Float64 +42 Int64 +-42 Int64 +18446744073709551615 UInt64 +Hello String +2020-01-01 Date +2020-01-01 00:00:00.000000000 DateTime64(9) +[1,2,3] Array(Nullable(Int64)) +['str1','str2','str3'] Array(Nullable(String)) +[[[1],[2,3,4]],[[5,6],[7]]] Array(Array(Array(Nullable(Int64)))) +['2020-01-01 00:00:00.000000000','2020-01-01 00:00:00.000000000'] Array(Nullable(DateTime64(9))) +['2020-01-01','2020-01-01 date'] Array(Nullable(String)) 
+['2020-01-01','2020-01-01 00:00:00','str'] Array(Nullable(String)) +['2020-01-01','2020-01-01 00:00:00','42'] Array(Nullable(String)) +['str','42'] Array(Nullable(String)) +[42,42.42] Array(Nullable(Float64)) +[42,18446744073709552000,42.42] Array(Nullable(Float64)) +[42,42.42] Array(Nullable(Float64)) +[NULL,NULL] Array(Nullable(String)) +[NULL,42] Array(Nullable(Int64)) +[[NULL],[],[42]] Array(Array(Nullable(Int64))) +[[],[NULL,NULL],[1,NULL,3],[NULL,2,NULL]] Array(Array(Nullable(Int64))) +[[],[NULL,NULL],['1',NULL,'3'],[NULL,'2',NULL],['2020-01-01']] Array(Array(Nullable(String))) +('str',42,[42]) Tuple(Nullable(String), Nullable(Int64), Array(Nullable(Int64))) +[42,18446744073709551615] Array(Nullable(UInt64)) +(-42,18446744073709551615) Tuple(Nullable(Int64), Nullable(UInt64)) diff --git a/tests/queries/0_stateless/03199_json_extract_dynamic.sql b/tests/queries/0_stateless/03199_json_extract_dynamic.sql new file mode 100644 index 00000000000..286949f4d3e --- /dev/null +++ b/tests/queries/0_stateless/03199_json_extract_dynamic.sql @@ -0,0 +1,37 @@ +set input_format_json_try_infer_numbers_from_strings=1; + +select JSONExtract(materialize('{"d" : true}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : 42}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : -42}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : 18446744073709551615}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : 42.42}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : "42"}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : "-42"}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : "18446744073709551615"}'), 'd', 'Dynamic') as d, dynamicType(d); + +select JSONExtract(materialize('{"d" : "Hello"}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : 
"2020-01-01"}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : "2020-01-01 00:00:00.000"}'), 'd', 'Dynamic') as d, dynamicType(d); + +select JSONExtract(materialize('{"d" : [1, 2, 3]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : ["str1", "str2", "str3"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [[[1], [2, 3, 4]], [[5, 6], [7]]]}'), 'd', 'Dynamic') as d, dynamicType(d); + +select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 date"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00", "str"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00", "42"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : ["str", "42"]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [42, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [42, 18446744073709551615, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [42, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d); + +select JSONExtract(materialize('{"d" : [null, null]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [null, 42]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [[null], [], [42]]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"a" : [[], [null, null], ["1", null, "3"], [null, "2", null]]}'), 'a', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"a" : [[], [null, null], ["1", null, "3"], [null, "2", null], ["2020-01-01"]]}'), 'a', 'Dynamic') as d, dynamicType(d); + +select JSONExtract(materialize('{"d" : 
["str", 42, [42]]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [42, 18446744073709551615]}'), 'd', 'Dynamic') as d, dynamicType(d); +select JSONExtract(materialize('{"d" : [-42, 18446744073709551615]}'), 'd', 'Dynamic') as d, dynamicType(d); From 5fe594243a4fc281cf3ee878b2f79b09dfd83970 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 3 Jul 2024 13:53:30 +0000 Subject: [PATCH 07/70] Remove old file --- src/Functions/FunctionsJSON.h | 1273 --------------------------------- 1 file changed, 1273 deletions(-) delete mode 100644 src/Functions/FunctionsJSON.h diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h deleted file mode 100644 index 5d44e22300d..00000000000 --- a/src/Functions/FunctionsJSON.h +++ /dev/null @@ -1,1273 +0,0 @@ -//#pragma once -// -//#include -//#include -// -//#include -// -//#include -// -//#include -//#include -//#include -// -//#include -////#include -// -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -// -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -// -//#include -//#include -//#include -//#include -//#include -// -//#include -//#include -// -// -//#include "config.h" -// -// -//namespace DB -//{ -// -//namespace ErrorCodes -//{ -// extern const int ILLEGAL_TYPE_OF_ARGUMENT; -// extern const int ILLEGAL_COLUMN; -// extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -//} -// -//template -//concept HasIndexOperator = requires (T t) -//{ -// t[0]; -//}; -// -///// Functions to parse JSONs and extract values from it. -///// The first argument of all these functions gets a JSON, -///// after that there are any number of arguments specifying path to a desired part from the JSON's root. 
-///// For example, -///// select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 -// -//class FunctionJSONHelpers -//{ -//public: -// template typename Impl, class JSONParser> -// class Executor -// { -// public: -// static ColumnPtr run(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) -// { -// MutableColumnPtr to{result_type->createColumn()}; -// to->reserve(input_rows_count); -// -// if (arguments.empty()) -// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least one argument", String(Name::name)); -// -// const auto & first_column = arguments[0]; -// if (!isString(first_column.type)) -// throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, -// "The first argument of function {} should be a string containing JSON, illegal type: " -// "{}", String(Name::name), first_column.type->getName()); -// -// const ColumnPtr & arg_json = first_column.column; -// const auto * col_json_const = typeid_cast(arg_json.get()); -// const auto * col_json_string -// = typeid_cast(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get()); -// -// if (!col_json_string) -// throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}", arg_json->getName()); -// -// const ColumnString::Chars & chars = col_json_string->getChars(); -// const ColumnString::Offsets & offsets = col_json_string->getOffsets(); -// -// size_t num_index_arguments = Impl::getNumberOfIndexArguments(arguments); -// std::vector moves = prepareMoves(Name::name, arguments, 1, num_index_arguments); -// -// /// Preallocate memory in parser if necessary. -// JSONParser parser; -// if constexpr (has_member_function_reserve::value) -// { -// size_t max_size = calculateMaxSize(offsets); -// if (max_size) -// parser.reserve(max_size); -// } -// -// Impl impl; -// -// /// prepare() does Impl-specific preparation before handling each row. 
-// if constexpr (has_member_function_prepare::*)(const char *, const ColumnsWithTypeAndName &, const DataTypePtr &)>::value) -// impl.prepare(Name::name, arguments, result_type); -// -// using Element = typename JSONParser::Element; -// -// Element document; -// bool document_ok = false; -// if (col_json_const) -// { -// std::string_view json{reinterpret_cast(chars.data()), offsets[0] - 1}; -// document_ok = parser.parse(json, document); -// } -// -// for (const auto i : collections::range(0, input_rows_count)) -// { -// if (!col_json_const) -// { -// std::string_view json{reinterpret_cast(&chars[offsets[i - 1]]), offsets[i] - offsets[i - 1] - 1}; -// document_ok = parser.parse(json, document); -// } -// -// bool added_to_column = false; -// if (document_ok) -// { -// /// Perform moves. -// Element element; -// std::string_view last_key; -// bool moves_ok = performMoves(arguments, i, document, moves, element, last_key); -// -// if (moves_ok) -// added_to_column = impl.insertResultToColumn(*to, element, last_key); -// } -// -// /// We add default value (=null or zero) if something goes wrong, we don't throw exceptions in these JSON functions. -// if (!added_to_column) -// to->insertDefault(); -// } -// return to; -// } -// }; -// -//private: -// BOOST_TTI_HAS_MEMBER_FUNCTION(reserve) -// BOOST_TTI_HAS_MEMBER_FUNCTION(prepare) -// -// /// Represents a move of a JSON iterator described by a single argument passed to a JSON function. -// /// For example, the call JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) -// /// contains two moves: {MoveType::ConstKey, "b"} and {MoveType::ConstIndex, 1}. -// /// Keys and indices can be nonconst, in this case they are calculated for each row. 
-// enum class MoveType : uint8_t -// { -// Key, -// Index, -// ConstKey, -// ConstIndex, -// }; -// -// struct Move -// { -// explicit Move(MoveType type_, size_t index_ = 0) : type(type_), index(index_) {} -// Move(MoveType type_, const String & key_) : type(type_), key(key_) {} -// MoveType type; -// size_t index = 0; -// String key; -// }; -// -// static std::vector prepareMoves( -// const char * function_name, -// const ColumnsWithTypeAndName & columns, -// size_t first_index_argument, -// size_t num_index_arguments) -// { -// std::vector moves; -// moves.reserve(num_index_arguments); -// for (const auto i : collections::range(first_index_argument, first_index_argument + num_index_arguments)) -// { -// const auto & column = columns[i]; -// if (!isString(column.type) && !isNativeInteger(column.type)) -// throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, -// "The argument {} of function {} should be a string specifying key " -// "or an integer specifying index, illegal type: {}", -// std::to_string(i + 1), String(function_name), column.type->getName()); -// -// if (column.column && isColumnConst(*column.column)) -// { -// const auto & column_const = assert_cast(*column.column); -// if (isString(column.type)) -// moves.emplace_back(MoveType::ConstKey, column_const.getValue()); -// else -// moves.emplace_back(MoveType::ConstIndex, column_const.getInt(0)); -// } -// else -// { -// if (isString(column.type)) -// moves.emplace_back(MoveType::Key, ""); -// else -// moves.emplace_back(MoveType::Index, 0); -// } -// } -// return moves; -// } -// -// -// /// Performs moves of types MoveType::Index and MoveType::ConstIndex. 
-// template -// static bool performMoves(const ColumnsWithTypeAndName & arguments, size_t row, -// const typename JSONParser::Element & document, const std::vector & moves, -// typename JSONParser::Element & element, std::string_view & last_key) -// { -// typename JSONParser::Element res_element = document; -// std::string_view key; -// -// for (size_t j = 0; j != moves.size(); ++j) -// { -// switch (moves[j].type) -// { -// case MoveType::ConstIndex: -// { -// if (!moveToElementByIndex(res_element, static_cast(moves[j].index), key)) -// return false; -// break; -// } -// case MoveType::ConstKey: -// { -// key = moves[j].key; -// if (!moveToElementByKey(res_element, key)) -// return false; -// break; -// } -// case MoveType::Index: -// { -// Int64 index = (*arguments[j + 1].column)[row].get(); -// if (!moveToElementByIndex(res_element, static_cast(index), key)) -// return false; -// break; -// } -// case MoveType::Key: -// { -// key = arguments[j + 1].column->getDataAt(row).toView(); -// if (!moveToElementByKey(res_element, key)) -// return false; -// break; -// } -// } -// } -// -// element = res_element; -// last_key = key; -// return true; -// } -// -// template -// static bool moveToElementByIndex(typename JSONParser::Element & element, int index, std::string_view & out_key) -// { -// if (element.isArray()) -// { -// auto array = element.getArray(); -// if (index >= 0) -// --index; -// else -// index += array.size(); -// -// if (static_cast(index) >= array.size()) -// return false; -// element = array[index]; -// out_key = {}; -// return true; -// } -// -// if constexpr (HasIndexOperator) -// { -// if (element.isObject()) -// { -// auto object = element.getObject(); -// if (index >= 0) -// --index; -// else -// index += object.size(); -// -// if (static_cast(index) >= object.size()) -// return false; -// std::tie(out_key, element) = object[index]; -// return true; -// } -// } -// -// return {}; -// } -// -// /// Performs moves of types MoveType::Key and 
MoveType::ConstKey. -// template -// static bool moveToElementByKey(typename JSONParser::Element & element, std::string_view key) -// { -// if (!element.isObject()) -// return false; -// auto object = element.getObject(); -// return object.find(key, element); -// } -// -// static size_t calculateMaxSize(const ColumnString::Offsets & offsets) -// { -// size_t max_size = 0; -// for (const auto i : collections::range(0, offsets.size())) -// { -// size_t size = offsets[i] - offsets[i - 1]; -// max_size = std::max(max_size, size); -// } -// if (max_size) -// --max_size; -// return max_size; -// } -// -//}; -// -//template -//class JSONExtractImpl; -// -//template -//class JSONExtractKeysAndValuesImpl; -// -///** -//* Functions JSONExtract and JSONExtractKeysAndValues force the return type - it is specified in the last argument. -//* For example - `SELECT JSONExtract(materialize('{"a": 131231, "b": 1234}'), 'b', 'LowCardinality(FixedString(4))')` -//* But by default ClickHouse decides on its own whether the return type will be LowCardinality based on the types of -//* input arguments. -//* And for these specific functions we cannot rely on this mechanism, so these functions have their own implementation - -//* just convert all of the LowCardinality input columns to full ones, execute and wrap the resulting column in LowCardinality -//* if needed. 
-//*/ -//template typename Impl> -//constexpr bool functionForcesTheReturnType() -//{ -// return std::is_same_v, JSONExtractImpl> || std::is_same_v, JSONExtractKeysAndValuesImpl>; -//} -// -//template typename Impl> -//class ExecutableFunctionJSON : public IExecutableFunction -//{ -// -//public: -// explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_) -// : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_) -// { -// } -// -// String getName() const override { return Name::name; } -// bool useDefaultImplementationForNulls() const override { return false; } -// bool useDefaultImplementationForConstants() const override { return true; } -// bool useDefaultImplementationForLowCardinalityColumns() const override -// { -// return !functionForcesTheReturnType(); -// } -// -// ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override -// { -// if (null_presence.has_null_constant) -// return result_type->createColumnConstWithDefaultValue(input_rows_count); -// -// if constexpr (functionForcesTheReturnType()) -// { -// ColumnsWithTypeAndName columns_without_low_cardinality = arguments; -// -// for (auto & column : columns_without_low_cardinality) -// { -// column.column = recursiveRemoveLowCardinality(column.column); -// column.type = recursiveRemoveLowCardinality(column.type); -// } -// -// ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? 
createBlockWithNestedColumns(columns_without_low_cardinality) : columns_without_low_cardinality; -// ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); -// -// if (null_presence.has_nullable) -// temporary_result = wrapInNullable(temporary_result, columns_without_low_cardinality, result_type, input_rows_count); -// -// if (result_type->lowCardinality()) -// temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); -// -// return temporary_result; -// } -// else -// { -// ColumnsWithTypeAndName temporary_columns = null_presence.has_nullable ? createBlockWithNestedColumns(arguments) : arguments; -// ColumnPtr temporary_result = chooseAndRunJSONParser(temporary_columns, json_return_type, input_rows_count); -// -// if (null_presence.has_nullable) -// temporary_result = wrapInNullable(temporary_result, arguments, result_type, input_rows_count); -// -// if (result_type->lowCardinality()) -// temporary_result = recursiveLowCardinalityTypeConversion(temporary_result, json_return_type, result_type); -// -// return temporary_result; -// } -// } -// -//private: -// -// ColumnPtr -// chooseAndRunJSONParser(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const -// { -//#if USE_SIMDJSON -// if (allow_simdjson) -// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -//#endif -// -//#if USE_RAPIDJSON -// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -//#else -// return FunctionJSONHelpers::Executor::run(arguments, result_type, input_rows_count); -//#endif -// } -// -// NullPresence null_presence; -// bool allow_simdjson; -// DataTypePtr json_return_type; -//}; -// -// -//template typename Impl> -//class FunctionBaseFunctionJSON : public IFunctionBase -//{ -//public: -// explicit FunctionBaseFunctionJSON( -// const NullPresence & null_presence_, -// bool 
allow_simdjson_, -// DataTypes argument_types_, -// DataTypePtr return_type_, -// DataTypePtr json_return_type_) -// : null_presence(null_presence_) -// , allow_simdjson(allow_simdjson_) -// , argument_types(std::move(argument_types_)) -// , return_type(std::move(return_type_)) -// , json_return_type(std::move(json_return_type_)) -// { -// } -// -// String getName() const override { return Name::name; } -// -// const DataTypes & getArgumentTypes() const override -// { -// return argument_types; -// } -// -// const DataTypePtr & getResultType() const override -// { -// return return_type; -// } -// -// bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } -// -// ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override -// { -// return std::make_unique>(null_presence, allow_simdjson, json_return_type); -// } -// -//private: -// NullPresence null_presence; -// bool allow_simdjson; -// DataTypes argument_types; -// DataTypePtr return_type; -// DataTypePtr json_return_type; -//}; -// -///// We use IFunctionOverloadResolver instead of IFunction to handle non-default NULL processing. -///// Both NULL and JSON NULL should generate NULL value. If any argument is NULL, return NULL. 
-//template typename Impl> -//class JSONOverloadResolver : public IFunctionOverloadResolver, WithContext -//{ -//public: -// static constexpr auto name = Name::name; -// -// String getName() const override { return name; } -// -// static FunctionOverloadResolverPtr create(ContextPtr context_) -// { -// return std::make_unique(context_); -// } -// -// explicit JSONOverloadResolver(ContextPtr context_) : WithContext(context_) {} -// -// bool isVariadic() const override { return true; } -// size_t getNumberOfArguments() const override { return 0; } -// bool useDefaultImplementationForNulls() const override { return false; } -// bool useDefaultImplementationForLowCardinalityColumns() const override -// { -// return !functionForcesTheReturnType(); -// } -// -// FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override -// { -// bool has_nothing_argument = false; -// for (const auto & arg : arguments) -// has_nothing_argument |= isNothing(arg.type); -// -// DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); -// NullPresence null_presence = getNullPresense(arguments); -// DataTypePtr return_type; -// if (has_nothing_argument) -// return_type = std::make_shared(); -// else if (null_presence.has_null_constant) -// return_type = makeNullable(std::make_shared()); -// else if (null_presence.has_nullable) -// return_type = makeNullable(json_return_type); -// else -// return_type = json_return_type; -// -// /// Top-level LowCardinality columns are processed outside JSON parser. 
-// json_return_type = removeLowCardinality(json_return_type); -// -// DataTypes argument_types; -// argument_types.reserve(arguments.size()); -// for (const auto & argument : arguments) -// argument_types.emplace_back(argument.type); -// return std::make_unique>( -// null_presence, getContext()->getSettingsRef().allow_simdjson, argument_types, return_type, json_return_type); -// } -//}; -// -//struct NameJSONHas { static constexpr auto name{"JSONHas"}; }; -//struct NameIsValidJSON { static constexpr auto name{"isValidJSON"}; }; -//struct NameJSONLength { static constexpr auto name{"JSONLength"}; }; -//struct NameJSONKey { static constexpr auto name{"JSONKey"}; }; -//struct NameJSONType { static constexpr auto name{"JSONType"}; }; -//struct NameJSONExtractInt { static constexpr auto name{"JSONExtractInt"}; }; -//struct NameJSONExtractUInt { static constexpr auto name{"JSONExtractUInt"}; }; -//struct NameJSONExtractFloat { static constexpr auto name{"JSONExtractFloat"}; }; -//struct NameJSONExtractBool { static constexpr auto name{"JSONExtractBool"}; }; -//struct NameJSONExtractString { static constexpr auto name{"JSONExtractString"}; }; -//struct NameJSONExtract { static constexpr auto name{"JSONExtract"}; }; -//struct NameJSONExtractKeysAndValues { static constexpr auto name{"JSONExtractKeysAndValues"}; }; -//struct NameJSONExtractRaw { static constexpr auto name{"JSONExtractRaw"}; }; -//struct NameJSONExtractArrayRaw { static constexpr auto name{"JSONExtractArrayRaw"}; }; -//struct NameJSONExtractKeysAndValuesRaw { static constexpr auto name{"JSONExtractKeysAndValuesRaw"}; }; -//struct NameJSONExtractKeys { static constexpr auto name{"JSONExtractKeys"}; }; -// -// -//template -//class JSONHasImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) { return std::make_shared(); } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & 
arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) -// { -// ColumnVector & col_vec = assert_cast &>(dest); -// col_vec.insertValue(1); -// return true; -// } -//}; -// -// -//template -//class IsValidJSONImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) -// { -// if (arguments.size() != 1) -// { -// /// IsValidJSON() shouldn't get parameters other than JSON. -// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs exactly one argument", -// String(function_name)); -// } -// return std::make_shared(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName &) { return 0; } -// -// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view) -// { -// /// This function is called only if JSON is valid. -// /// If JSON isn't valid then `FunctionJSON::Executor::run()` adds default value (=zero) to `dest` without calling this function. 
-// ColumnVector & col_vec = assert_cast &>(dest); -// col_vec.insertValue(1); -// return true; -// } -//}; -// -// -//template -//class JSONLengthImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// size_t size; -// if (element.isArray()) -// size = element.getArray().size(); -// else if (element.isObject()) -// size = element.getObject().size(); -// else -// return false; -// -// ColumnVector & col_vec = assert_cast &>(dest); -// col_vec.insertValue(size); -// return true; -// } -//}; -// -// -//template -//class JSONKeyImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element &, std::string_view last_key) -// { -// if (last_key.empty()) -// return false; -// ColumnString & col_str = assert_cast(dest); -// col_str.insertData(last_key.data(), last_key.size()); -// return true; -// } -//}; -// -// -//template -//class JSONTypeImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// static const std::vector> values = { -// {"Array", '['}, -// {"Object", '{'}, -// {"String", '"'}, -// {"Int64", 'i'}, -// {"UInt64", 'u'}, -// {"Double", 'd'}, -// {"Bool", 'b'}, -// {"Null", 0}, /// the default value for the column. 
-// }; -// return std::make_shared>(values); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// UInt8 type; -// switch (element.type()) -// { -// case ElementType::INT64: -// type = 'i'; -// break; -// case ElementType::UINT64: -// type = 'u'; -// break; -// case ElementType::DOUBLE: -// type = 'd'; -// break; -// case ElementType::STRING: -// type = '"'; -// break; -// case ElementType::ARRAY: -// type = '['; -// break; -// case ElementType::OBJECT: -// type = '{'; -// break; -// case ElementType::BOOL: -// type = 'b'; -// break; -// case ElementType::NULL_VALUE: -// type = 0; -// break; -// } -// -// ColumnVector & col_vec = assert_cast &>(dest); -// col_vec.insertValue(type); -// return true; -// } -//}; -// -// -//template -//class JSONExtractNumericImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared>(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// NumberType value; -// -// switch (element.type()) -// { -// case ElementType::DOUBLE: -// if constexpr (std::is_floating_point_v) -// { -// /// We permit inaccurate conversion of double to float. -// /// Example: double 0.1 from JSON is not representable in float. -// /// But it will be more convenient for user to perform conversion. 
-// value = static_cast(element.getDouble()); -// } -// else if (!accurate::convertNumeric(element.getDouble(), value)) -// return false; -// break; -// case ElementType::UINT64: -// if (!accurate::convertNumeric(element.getUInt64(), value)) -// return false; -// break; -// case ElementType::INT64: -// if (!accurate::convertNumeric(element.getInt64(), value)) -// return false; -// break; -// case ElementType::BOOL: -// if constexpr (is_integer && convert_bool_to_integer) -// { -// value = static_cast(element.getBool()); -// break; -// } -// return false; -// case ElementType::STRING: -// { -// auto rb = ReadBufferFromMemory{element.getString()}; -// if constexpr (std::is_floating_point_v) -// { -// if (!tryReadFloatText(value, rb) || !rb.eof()) -// return false; -// } -// else -// { -// if (tryReadIntText(value, rb) && rb.eof()) -// break; -// -// /// Try to parse float and convert it to integer. -// Float64 tmp_float; -// rb.position() = rb.buffer().begin(); -// if (!tryReadFloatText(tmp_float, rb) || !rb.eof()) -// return false; -// -// if (!accurate::convertNumeric(tmp_float, value)) -// return false; -// } -// break; -// } -// default: -// return false; -// } -// -// if (dest.getDataType() == TypeIndex::LowCardinality) -// { -// ColumnLowCardinality & col_low = assert_cast(dest); -// col_low.insertData(reinterpret_cast(&value), sizeof(value)); -// } -// else -// { -// auto & col_vec = assert_cast &>(dest); -// col_vec.insertValue(value); -// } -// return true; -// } -//}; -// -// -//template -//using JSONExtractInt64Impl = JSONExtractNumericImpl; -//template -//using JSONExtractUInt64Impl = JSONExtractNumericImpl; -//template -//using JSONExtractFloat64Impl = JSONExtractNumericImpl; -// -// -//template -//class JSONExtractBoolImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(); -// } -// -// static size_t 
getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// bool value; -// switch (element.type()) -// { -// case ElementType::BOOL: -// value = element.getBool(); -// break; -// case ElementType::INT64: -// value = element.getInt64() != 0; -// break; -// case ElementType::UINT64: -// value = element.getUInt64() != 0; -// break; -// default: -// return false; -// } -// -// auto & col_vec = assert_cast &>(dest); -// col_vec.insertValue(static_cast(value)); -// return true; -// } -//}; -// -//template -//class JSONExtractRawImpl; -// -//template -//class JSONExtractStringImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (element.isNull()) -// return false; -// -// if (!element.isString()) -// return JSONExtractRawImpl::insertResultToColumn(dest, element, {}); -// -// auto str = element.getString(); -// -// if (dest.getDataType() == TypeIndex::LowCardinality) -// { -// ColumnLowCardinality & col_low = assert_cast(dest); -// col_low.insertData(str.data(), str.size()); -// } -// else -// { -// ColumnString & col_str = assert_cast(dest); -// col_str.insertData(str.data(), str.size()); -// } -// return true; -// } -//}; -// -// -//static const JSONExtractInsertSettings & getJSONExtractInsertSettings() -//{ -// static const JSONExtractInsertSettings instance = [] -// { -// JSONExtractInsertSettings settings; -// settings.insert_null_as_default = false; -// settings.insert_default_on_invalid_elements_in_complex_types = true; -// return settings; 
-// }(); -// return instance; -//} -// -//template -//class JSONExtractImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) -// { -// if (arguments.size() < 2) -// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); -// -// const auto & col = arguments.back(); -// const auto * col_type_const = typeid_cast(col.column.get()); -// if (!col_type_const || !isString(col.type)) -// throw Exception(ErrorCodes::ILLEGAL_COLUMN, -// "The last argument of function {} should " -// "be a constant string specifying the return data type, illegal value: {}", -// String(function_name), col.name); -// -// return DataTypeFactory::instance().get(col_type_const->getValue()); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } -// -// void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) -// { -// extract_tree = buildJSONExtractTree(result_type, function_name); -// } -// -// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// String error; -// return extract_tree->insertResultToColumn(dest, element, getJSONExtractInsertSettings(), error); -// } -// -//protected: -// std::unique_ptr> extract_tree; -//}; -// -// -//template -//class JSONExtractKeysAndValuesImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char * function_name, const ColumnsWithTypeAndName & arguments) -// { -// if (arguments.size() < 2) -// throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires at least two arguments", String(function_name)); -// -// const auto & col = arguments.back(); -// const auto * col_type_const = typeid_cast(col.column.get()); 
-// if (!col_type_const || !isString(col.type)) -// throw Exception(ErrorCodes::ILLEGAL_COLUMN, -// "The last argument of function {} should " -// "be a constant string specifying the values' data type, illegal value: {}", -// String(function_name), col.name); -// -// DataTypePtr key_type = std::make_unique(); -// DataTypePtr value_type = DataTypeFactory::instance().get(col_type_const->getValue()); -// DataTypePtr tuple_type = std::make_unique(DataTypes{key_type, value_type}); -// return std::make_unique(tuple_type); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 2; } -// -// void prepare(const char * function_name, const ColumnsWithTypeAndName &, const DataTypePtr & result_type) -// { -// const auto tuple_type = typeid_cast(result_type.get())->getNestedType(); -// const auto value_type = typeid_cast(tuple_type.get())->getElements()[1]; -// extract_tree = buildJSONExtractTree(value_type, function_name); -// } -// -// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (!element.isObject()) -// return false; -// -// auto object = element.getObject(); -// -// auto & col_arr = assert_cast(dest); -// auto & col_tuple = assert_cast(col_arr.getData()); -// size_t old_size = col_tuple.size(); -// auto & col_key = assert_cast(col_tuple.getColumn(0)); -// auto & col_value = col_tuple.getColumn(1); -// -// String error; -// for (const auto & [key, value] : object) -// { -// if (extract_tree->insertResultToColumn(col_value, value, getJSONExtractInsertSettings(), error)) -// col_key.insertData(key.data(), key.size()); -// } -// -// if (col_tuple.size() == old_size) -// return false; -// -// col_arr.getOffsets().push_back(col_tuple.size()); -// return true; -// } -// -//private: -// std::unique_ptr> extract_tree; -//}; -// -// -//template -//class JSONExtractRawImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr 
getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (dest.getDataType() == TypeIndex::LowCardinality) -// { -// ColumnString::Chars chars; -// WriteBufferFromVector buf(chars, AppendModeTag()); -// traverse(element, buf); -// buf.finalize(); -// assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); -// } -// else -// { -// ColumnString & col_str = assert_cast(dest); -// auto & chars = col_str.getChars(); -// WriteBufferFromVector buf(chars, AppendModeTag()); -// traverse(element, buf); -// buf.finalize(); -// chars.push_back(0); -// col_str.getOffsets().push_back(chars.size()); -// } -// return true; -// } -// -// // We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column -// static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view) -// { -// ColumnFixedString::Chars chars; -// WriteBufferFromVector buf(chars, AppendModeTag()); -// traverse(element, buf); -// buf.finalize(); -// -// auto & col_str = assert_cast(dest); -// -// if (chars.size() > col_str.getN()) -// return false; -// -// chars.resize_fill(col_str.getN()); -// col_str.insertData(reinterpret_cast(chars.data()), chars.size()); -// -// -// return true; -// } -// -// // We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column -// static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length) -// { -// if (element.getObject().size() > fixed_length) -// return false; -// -// ColumnFixedString::Chars chars; -// WriteBufferFromVector buf(chars, AppendModeTag()); -// traverse(element, buf); -// 
buf.finalize(); -// -// if (chars.size() > fixed_length) -// return false; -// chars.resize_fill(fixed_length); -// assert_cast(dest).insertData(reinterpret_cast(chars.data()), chars.size()); -// -// return true; -// } -// -//private: -// static void traverse(const Element & element, WriteBuffer & buf) -// { -// if (element.isInt64()) -// { -// writeIntText(element.getInt64(), buf); -// return; -// } -// if (element.isUInt64()) -// { -// writeIntText(element.getUInt64(), buf); -// return; -// } -// if (element.isDouble()) -// { -// writeFloatText(element.getDouble(), buf); -// return; -// } -// if (element.isBool()) -// { -// if (element.getBool()) -// writeCString("true", buf); -// else -// writeCString("false", buf); -// return; -// } -// if (element.isString()) -// { -// writeJSONString(element.getString(), buf, formatSettings()); -// return; -// } -// if (element.isArray()) -// { -// writeChar('[', buf); -// bool need_comma = false; -// for (auto value : element.getArray()) -// { -// if (std::exchange(need_comma, true)) -// writeChar(',', buf); -// traverse(value, buf); -// } -// writeChar(']', buf); -// return; -// } -// if (element.isObject()) -// { -// writeChar('{', buf); -// bool need_comma = false; -// for (auto [key, value] : element.getObject()) -// { -// if (std::exchange(need_comma, true)) -// writeChar(',', buf); -// writeJSONString(key, buf, formatSettings()); -// writeChar(':', buf); -// traverse(value, buf); -// } -// writeChar('}', buf); -// return; -// } -// if (element.isNull()) -// { -// writeCString("null", buf); -// return; -// } -// } -// -// static const FormatSettings & formatSettings() -// { -// static const FormatSettings the_instance = [] -// { -// FormatSettings settings; -// settings.json.escape_forward_slashes = false; -// return settings; -// }(); -// return the_instance; -// } -//}; -// -// -//template -//class JSONExtractArrayRawImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr 
getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_shared(std::make_shared()); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (!element.isArray()) -// return false; -// -// auto array = element.getArray(); -// ColumnArray & col_res = assert_cast(dest); -// -// for (auto value : array) -// JSONExtractRawImpl::insertResultToColumn(col_res.getData(), value, {}); -// -// col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size()); -// return true; -// } -//}; -// -// -//template -//class JSONExtractKeysAndValuesRawImpl -//{ -//public: -// using Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// DataTypePtr string_type = std::make_unique(); -// DataTypePtr tuple_type = std::make_unique(DataTypes{string_type, string_type}); -// return std::make_unique(tuple_type); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (!element.isObject()) -// return false; -// -// auto object = element.getObject(); -// -// auto & col_arr = assert_cast(dest); -// auto & col_tuple = assert_cast(col_arr.getData()); -// auto & col_key = assert_cast(col_tuple.getColumn(0)); -// auto & col_value = assert_cast(col_tuple.getColumn(1)); -// -// for (const auto & [key, value] : object) -// { -// col_key.insertData(key.data(), key.size()); -// JSONExtractRawImpl::insertResultToColumn(col_value, value, {}); -// } -// -// col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size()); -// return true; -// } -//}; -// -//template -//class JSONExtractKeysImpl -//{ -//public: -// using 
Element = typename JSONParser::Element; -// -// static DataTypePtr getReturnType(const char *, const ColumnsWithTypeAndName &) -// { -// return std::make_unique(std::make_shared()); -// } -// -// static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } -// -// bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view) -// { -// if (!element.isObject()) -// return false; -// -// auto object = element.getObject(); -// -// ColumnArray & col_res = assert_cast(dest); -// auto & col_key = assert_cast(col_res.getData()); -// -// for (const auto & [key, value] : object) -// { -// col_key.insertData(key.data(), key.size()); -// } -// -// col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size()); -// return true; -// } -//}; -// -//} From 63303dd79893ace08ce2ed4be6bfee422287d44b Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 3 Jul 2024 14:03:04 +0000 Subject: [PATCH 08/70] Fix style --- src/Formats/JSONExtractTree.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index 6d019f96ba6..18437c16bc9 100644 --- a/src/Formats/JSONExtractTree.cpp +++ b/src/Formats/JSONExtractTree.cpp @@ -50,6 +50,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + template void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings) { @@ -207,7 +212,7 @@ namespace { template -String jsonElementToString(const typename JSONParser::Element & element, const FormatSettings & format_settings) +String jsonElementToString(const typename JSONParser::Element & element, const FormatSettings & format_settings) { WriteBufferFromOwnString buf; jsonElementToString(element, buf, format_settings); @@ -1440,7 +1445,7 @@ std::unique_ptr> buildJSONExtractTree(const Data case TypeIndex::Date:; return 
std::make_unique>(); case TypeIndex::Date32: - return std::make_unique>(); + return std::make_unique>(); case TypeIndex::DateTime: return std::make_unique>(assert_cast(*type)); case TypeIndex::DateTime64: From d03fcb5ff121203f9cd6bf729df98764593328fe Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 3 Jul 2024 14:23:38 +0000 Subject: [PATCH 09/70] Fix --- src/Formats/SchemaInferenceUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 6519d54a8c5..f2ad1dc6717 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -271,7 +271,7 @@ namespace { if (WhichDataType(type).isInt64()) { - bool is_negative = json_info->negative_integers.contains(type.get()); + bool is_negative = json_info && json_info->negative_integers.contains(type.get()); have_negative_integers |= is_negative; if (!is_negative) type = std::make_shared(); From 6530ae104d16ffbda51cf849b5f89e3d4080d2af Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 3 Jul 2024 15:23:01 +0000 Subject: [PATCH 10/70] Fix tests --- src/Formats/SchemaInferenceUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index f2ad1dc6717..3c374ada9e6 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -296,7 +296,7 @@ namespace if (which.isInt64() || which.isUInt64()) { auto new_type = std::make_shared(); - if (json_info->numbers_parsed_from_json_strings.erase(type.get())) + if (json_info && json_info->numbers_parsed_from_json_strings.erase(type.get())) json_info->numbers_parsed_from_json_strings.insert(new_type.get()); type = new_type; } From a5adf31b9e4dfa041150fa263ab68f32cb47122c Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 3 Jul 2024 19:30:36 +0200 Subject: [PATCH 11/70] Fix special build --- 
src/Formats/JSONExtractTree.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index 18437c16bc9..b94981e7cb4 100644 --- a/src/Formats/JSONExtractTree.cpp +++ b/src/Formats/JSONExtractTree.cpp @@ -1465,9 +1465,9 @@ std::unique_ptr> buildJSONExtractTree(const Data case TypeIndex::LowCardinality: { /// To optimize inserting into LowCardinality we have special nodes for LowCardinality of numeric and string types. - auto lc_type = typeid_cast(type.get()); - auto dictionary_type = removeNullable(lc_type->getDictionaryType()); - bool is_nullable = lc_type->isLowCardinalityNullable(); + const auto & lc_type = assert_cast(*type)); + auto dictionary_type = removeNullable(lc_type.getDictionaryType()); + bool is_nullable = lc_type.isLowCardinalityNullable(); switch (dictionary_type->getTypeId()) { From 41b9216dd1d862b46ed72d50e899197e2fec9daa Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 4 Jul 2024 00:22:41 +0200 Subject: [PATCH 12/70] Fix build --- src/Formats/JSONExtractTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index b94981e7cb4..827f276311a 100644 --- a/src/Formats/JSONExtractTree.cpp +++ b/src/Formats/JSONExtractTree.cpp @@ -1465,7 +1465,7 @@ std::unique_ptr> buildJSONExtractTree(const Data case TypeIndex::LowCardinality: { /// To optimize inserting into LowCardinality we have special nodes for LowCardinality of numeric and string types. 
- const auto & lc_type = assert_cast(*type)); + const auto & lc_type = assert_cast(*type); auto dictionary_type = removeNullable(lc_type.getDictionaryType()); bool is_nullable = lc_type.isLowCardinalityNullable(); From 3776fafc881bf4725bfbb356e6e81df66ad336b6 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 4 Jul 2024 13:44:44 +0000 Subject: [PATCH 13/70] Print stacktrace in case of about after logical error. --- src/Common/Exception.cpp | 9 ++++++--- src/Common/StackTrace.cpp | 2 +- src/Common/StackTrace.h | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 1f4b0aea8f2..181b4f1488e 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -38,9 +38,12 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } -void abortOnFailedAssertion(const String & description) +void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr) { - LOG_FATAL(&Poco::Logger::root(), "Logical error: '{}'.", description); + auto & logger = Poco::Logger::root(); + LOG_FATAL(&logger, "Logical error: '{}'.", description); + if (trace) + LOG_FATAL(&logger, "Stack trace (when copying this message, always include the lines below):\n\n{}", StackTrace::toString(trace->data(), 0, trace->size())); abort(); } @@ -58,7 +61,7 @@ void handle_error_code(const std::string & msg, int code, bool remote, const Exc #ifdef ABORT_ON_LOGICAL_ERROR if (code == ErrorCodes::LOGICAL_ERROR) { - abortOnFailedAssertion(msg); + abortOnFailedAssertion(msg, &trace); } #endif diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index 239e957bdfe..34f6f0b7535 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -545,7 +545,7 @@ std::string StackTrace::toString() const return toStringCached(frame_pointers, offset, size); } -std::string StackTrace::toString(void ** frame_pointers_raw, size_t offset, size_t size) +std::string 
StackTrace::toString(void * const * frame_pointers_raw, size_t offset, size_t size) { __msan_unpoison(frame_pointers_raw, size * sizeof(*frame_pointers_raw)); diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index 4ce9a9281f3..2078828f3d7 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -59,7 +59,7 @@ public: const FramePointers & getFramePointers() const { return frame_pointers; } std::string toString() const; - static std::string toString(void ** frame_pointers, size_t offset, size_t size); + static std::string toString(void * const * frame_pointers, size_t offset, size_t size); static void dropCache(); /// @param fatal - if true, will process inline frames (slower) From 4271b2b6e3d6940603ef0d1836fbabf42b092d65 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 4 Jul 2024 16:29:32 +0000 Subject: [PATCH 14/70] Add noreturn/ --- src/Common/Exception.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 181b4f1488e..07bda6a75be 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -38,7 +38,7 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } -void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr) +[[noreturn]] void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr) { auto & logger = Poco::Logger::root(); LOG_FATAL(&logger, "Logical error: '{}'.", description); From f556f2cd9529acfdf796df91c20abec0ce405a95 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 4 Jul 2024 18:28:22 +0000 Subject: [PATCH 15/70] Try to fix special build --- src/Formats/JSONExtractTree.cpp | 1 + src/Functions/FunctionsJSON.cpp | 13 ++----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index 827f276311a..8fe472930d3 100644 --- a/src/Formats/JSONExtractTree.cpp +++ 
b/src/Formats/JSONExtractTree.cpp @@ -1558,6 +1558,7 @@ template std::unique_ptr> buildJSONExtractTr #if USE_RAPIDJSON template void jsonElementToString(const RapidJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); template std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); +template bool tryGetNumericValueFromJSONElement(Float64 & value, const RapidJSONParser::Element & element, bool convert_bool_to_integer, String & error); #else template void jsonElementToString(const DummyJSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings); template std::unique_ptr> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message); diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index c6af0674db7..ca233becb63 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -736,17 +736,8 @@ public: NumberType value; tryGetNumericValueFromJSONElement(value, element, convert_bool_to_integer, error); - - if (dest.getDataType() == TypeIndex::LowCardinality) - { - ColumnLowCardinality & col_low = assert_cast(dest); - col_low.insertData(reinterpret_cast(&value), sizeof(value)); - } - else - { - auto & col_vec = assert_cast &>(dest); - col_vec.insertValue(value); - } + auto & col_vec = assert_cast &>(dest); + col_vec.insertValue(value); return true; } }; From 64ef36dab362094e0bfa4a32e09b830502eb2c56 Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Fri, 5 Jul 2024 11:19:06 +0000 Subject: [PATCH 16/70] fix deadlock --- src/Databases/DatabaseAtomic.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index b30b05bb7a7..a48eb2abce6 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -106,12 +106,17 @@ void 
DatabaseAtomic::attachTable(ContextPtr /* context_ */, const String & name, StoragePtr DatabaseAtomic::detachTable(ContextPtr /* context */, const String & name) { + // it is important to call destructures not_in_use without + // blocking mutex for avoid potential deadlock. DetachedTables not_in_use; - std::lock_guard lock(mutex); - auto table = DatabaseOrdinary::detachTableUnlocked(name); - table_name_to_path.erase(name); - detached_tables.emplace(table->getStorageID().uuid, table); - not_in_use = cleanupDetachedTables(); + StoragePtr table; + { + std::lock_guard lock(mutex); + table = DatabaseOrdinary::detachTableUnlocked(name); + table_name_to_path.erase(name); + detached_tables.emplace(table->getStorageID().uuid, table); + not_in_use = cleanupDetachedTables(); + } if (!not_in_use.empty()) { From 17e089c490efbf1ac4224fa8cf0b74bb3f50739a Mon Sep 17 00:00:00 2001 From: zhongyuankai <872237106@qq.com> Date: Sun, 7 Jul 2024 18:22:55 +0800 Subject: [PATCH 17/70] Refactor `OptimizeIfWithConstantConditionVisitor` using `InDepthNodeVisitor` --- ...OptimizeIfWithConstantConditionVisitor.cpp | 93 ++++++++----------- .../OptimizeIfWithConstantConditionVisitor.h | 17 ++-- src/Interpreters/TreeOptimizer.cpp | 3 +- 3 files changed, 52 insertions(+), 61 deletions(-) diff --git a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp index 20451fb20ad..48c9988b6fc 100644 --- a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp +++ b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp @@ -73,66 +73,55 @@ static bool tryExtractConstValueFromCondition(const ASTPtr & condition, bool & v return false; } -void OptimizeIfWithConstantConditionVisitor::visit(ASTPtr & current_ast) +void OptimizeIfWithConstantConditionVisitorData::visit(ASTFunction & function_node, ASTPtr & ast) { - if (!current_ast) - return; - checkStackSize(); - for (ASTPtr & child : current_ast->children) + if (function_node.name != 
"if") + return; + + if (!function_node.arguments) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of arguments for function 'if' (0 instead of 3)"); + + if (function_node.arguments->children.size() != 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Wrong number of arguments for function 'if' ({} instead of 3)", + function_node.arguments->children.size()); + + const auto * args = function_node.arguments->as(); + + ASTPtr condition_expr = args->children[0]; + ASTPtr then_expr = args->children[1]; + ASTPtr else_expr = args->children[2]; + + bool condition; + if (tryExtractConstValueFromCondition(condition_expr, condition)) { - auto * function_node = child->as(); - if (!function_node || function_node->name != "if") + ASTPtr replace_ast = condition ? then_expr : else_expr; + ASTPtr child_copy = ast; + String replace_alias = replace_ast->tryGetAlias(); + String if_alias = ast->tryGetAlias(); + + if (replace_alias.empty()) { - visit(child); - continue; + replace_ast->setAlias(if_alias); + ast = replace_ast; + } + else + { + /// Only copy of one node is required here. + /// But IAST has only method for deep copy of subtree. + /// This can be a reason of performance degradation in case of deep queries. 
+ ASTPtr replace_ast_deep_copy = replace_ast->clone(); + replace_ast_deep_copy->setAlias(if_alias); + ast = replace_ast_deep_copy; } - if (!function_node->arguments) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of arguments for function 'if' (0 instead of 3)"); - - if (function_node->arguments->children.size() != 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Wrong number of arguments for function 'if' ({} instead of 3)", - function_node->arguments->children.size()); - - visit(function_node->arguments); - const auto * args = function_node->arguments->as(); - - ASTPtr condition_expr = args->children[0]; - ASTPtr then_expr = args->children[1]; - ASTPtr else_expr = args->children[2]; - - bool condition; - if (tryExtractConstValueFromCondition(condition_expr, condition)) + if (!if_alias.empty()) { - ASTPtr replace_ast = condition ? then_expr : else_expr; - ASTPtr child_copy = child; - String replace_alias = replace_ast->tryGetAlias(); - String if_alias = child->tryGetAlias(); - - if (replace_alias.empty()) - { - replace_ast->setAlias(if_alias); - child = replace_ast; - } - else - { - /// Only copy of one node is required here. - /// But IAST has only method for deep copy of subtree. - /// This can be a reason of performance degradation in case of deep queries. 
- ASTPtr replace_ast_deep_copy = replace_ast->clone(); - replace_ast_deep_copy->setAlias(if_alias); - child = replace_ast_deep_copy; - } - - if (!if_alias.empty()) - { - auto alias_it = aliases.find(if_alias); - if (alias_it != aliases.end() && alias_it->second.get() == child_copy.get()) - alias_it->second = child; - } + auto alias_it = aliases.find(if_alias); + if (alias_it != aliases.end() && alias_it->second.get() == child_copy.get()) + alias_it->second = ast; } } } diff --git a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.h b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.h index ad98f92bafd..3b46f90f07c 100644 --- a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.h +++ b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.h @@ -1,23 +1,24 @@ #pragma once #include +#include namespace DB { - -/// It removes Function_if node from AST if condition is constant. -/// TODO: rewrite with InDepthNodeVisitor -class OptimizeIfWithConstantConditionVisitor +struct OptimizeIfWithConstantConditionVisitorData { -public: - explicit OptimizeIfWithConstantConditionVisitor(Aliases & aliases_) + using TypeToVisit = ASTFunction; + + explicit OptimizeIfWithConstantConditionVisitorData(Aliases & aliases_) : aliases(aliases_) {} - void visit(ASTPtr & ast); - + void visit(ASTFunction & function_node, ASTPtr & ast); private: Aliases & aliases; }; +/// It removes Function_if node from AST if condition is constant. +using OptimizeIfWithConstantConditionVisitor = InDepthNodeVisitor, false>; + } diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index b88d75cd5a2..b872eb94fde 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -577,7 +577,8 @@ void TreeOptimizer::optimizeIf(ASTPtr & query, Aliases & aliases, bool if_chain_ optimizeMultiIfToIf(query); /// Optimize if with constant condition after constants was substituted instead of scalar subqueries. 
- OptimizeIfWithConstantConditionVisitor(aliases).visit(query); + OptimizeIfWithConstantConditionVisitorData visitor_data(aliases); + OptimizeIfWithConstantConditionVisitor(visitor_data).visit(query); if (if_chain_to_multiif) OptimizeIfChainsVisitor().visit(query); From 97c6cbec46f5c93c2c6199576592a9262aff56f0 Mon Sep 17 00:00:00 2001 From: Blargian Date: Sun, 7 Jul 2024 22:38:43 +0200 Subject: [PATCH 18/70] ad individual window function pages --- .../window-functions/dense_rank.md | 73 ++++++++++++++++++ .../sql-reference/window-functions/index.md | 35 ++++----- .../en/sql-reference/window-functions/rank.md | 74 +++++++++++++++++++ .../window-functions/row_number.md | 0 4 files changed, 165 insertions(+), 17 deletions(-) create mode 100644 docs/en/sql-reference/window-functions/dense_rank.md create mode 100644 docs/en/sql-reference/window-functions/rank.md create mode 100644 docs/en/sql-reference/window-functions/row_number.md diff --git a/docs/en/sql-reference/window-functions/dense_rank.md b/docs/en/sql-reference/window-functions/dense_rank.md new file mode 100644 index 00000000000..17ab894707e --- /dev/null +++ b/docs/en/sql-reference/window-functions/dense_rank.md @@ -0,0 +1,73 @@ +--- +slug: /en/sql-reference/window-functions/dense_rank +sidebar_label: dense_rank +sidebar_position: 2 +--- + +# dense_rank + +This window function ranks the current row within its partition without gaps. In other words, if the value of any new row encountered is equal to the value of one of the previous rows then it will receive the next successive rank without any gaps in ranking. + +The [rank](./rank.md) function provides the same behaviour, but with gaps in ranking. 
+ +**Syntax** + +```sql +dense_rank (column_name) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Returned value** + +- A number for the current row within its partition, without gaps in ranking. [UInt64](../data-types/int-uint.md). + +**Example** + +The following example is based on the example provided in the video instructional [Ranking window functions in ClickHouse](https://youtu.be/Yku9mmBYm_4?si=XIMu1jpYucCQEoXA). + +Query: + +```sql +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 195000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 150000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 150000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), + ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), + ('South Hampton Seagulls', 'James Henderson', 140000, 'M'); +``` + +```sql +SELECT player, salary, + dense_rank() OVER (ORDER BY salary DESC) AS dense_rank +FROM salaries; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─dense_rank─┐ +1. │ Gary Chen │ 195000 │ 1 │ +2. │ Robert George │ 195000 │ 1 │ +3. │ Charles Juarez │ 190000 │ 2 │ +4. │ Michael Stanley │ 150000 │ 3 │ +5. │ Douglas Benson │ 150000 │ 3 │ +6. │ Scott Harrison │ 150000 │ 3 │ +7. 
│ James Henderson │ 140000 │ 4 │ + └─────────────────┴────────┴────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 3a8afd10359..a0246af610f 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -1,10 +1,11 @@ --- slug: /en/sql-reference/window-functions/ -sidebar_position: 62 sidebar_label: Window Functions -title: Window Functions +sidebar_position: 1 --- +# Window Functions + Windows functions let you perform calculations across a set of rows that are related to the current row. Some of the calculations that you can do are similar to those that can be done with an aggregate function, but a window function doesn't cause rows to be grouped into a single output - the individual rows are still returned. @@ -12,19 +13,19 @@ Some of the calculations that you can do are similar to those that can be done w ClickHouse supports the standard grammar for defining windows and window functions. The table below indicates whether a feature is currently supported. -| Feature | Supported? | -|------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Feature | Supported? | +|--------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ad hoc window specification (`count(*) over (partition by id order by time desc)`) | ✅ | -| expressions involving window functions, e.g. `(count(*) over ()) / 2)` | ✅ | -| `WINDOW` clause (`select ... 
from table window w as (partition by id)`) | ✅ | -| `ROWS` frame | ✅ | -| `RANGE` frame | ✅ (the default) | -| `INTERVAL` syntax for `DateTime` `RANGE OFFSET` frame | ❌ (specify the number of seconds instead (`RANGE` works with any numeric type).) | -| `GROUPS` frame | ❌ | -| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | ✅ (All aggregate functions are supported) | -| `rank()`, `dense_rank()`, `row_number()` | ✅ | -| `lag/lead(value, offset)` | ❌
You can use one of the following workarounds:
1) `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`
2) `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | -| ntile(buckets) | ✅
Specify window like, (partition by x order by y rows between unbounded preceding and unrounded following). | +| expressions involving window functions, e.g. `(count(*) over ()) / 2)` | ✅ | +| `WINDOW` clause (`select ... from table window w as (partition by id)`) | ✅ | +| `ROWS` frame | ✅ | +| `RANGE` frame | ✅ (the default) | +| `INTERVAL` syntax for `DateTime` `RANGE OFFSET` frame | ❌ (specify the number of seconds instead (`RANGE` works with any numeric type).) | +| `GROUPS` frame | ❌ | +| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | ✅ (All aggregate functions are supported) | +| `rank()`, `dense_rank()`, `row_number()` | ✅ | +| `lag/lead(value, offset)` | ❌
You can use one of the following workarounds:
1) `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`
2) `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | +| ntile(buckets) | ✅
Specify window like, (partition by x order by y rows between unbounded preceding and unbounded following). | ## ClickHouse-specific Window Functions @@ -74,12 +75,12 @@ WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column] These functions can be used only as a window function. -- `row_number()` - Number the current row within its partition starting from 1. +- [`row_number()`](./row_number.md) - Number the current row within its partition starting from 1. - `first_value(x)` - Return the first non-NULL value evaluated within its ordered frame. - `last_value(x)` - Return the last non-NULL value evaluated within its ordered frame. - `nth_value(x, offset)` - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. -- `rank()` - Rank the current row within its partition with gaps. -- `dense_rank()` - Rank the current row within its partition without gaps. +- [`rank()`](./rank.md) - Rank the current row within its partition with gaps. +- [`dense_rank()`](./dense_rank.md) - Rank the current row within its partition without gaps. - `lagInFrame(x)` - Return a value evaluated at the row that is at a specified physical offset row before the current row within the ordered frame. - `leadInFrame(x)` - Return a value evaluated at the row that is offset rows after the current row within the ordered frame. diff --git a/docs/en/sql-reference/window-functions/rank.md b/docs/en/sql-reference/window-functions/rank.md new file mode 100644 index 00000000000..17db889ef92 --- /dev/null +++ b/docs/en/sql-reference/window-functions/rank.md @@ -0,0 +1,74 @@ +--- +slug: /en/sql-reference/window-functions/rank +sidebar_label: rank +sidebar_position: 3 +--- + +# rank + +This window function ranks the current row within its partition with gaps. In other words, if the value of any row it encounters is equal to the value of a previous row then it will receive the same rank as that previous row.
+The rank of the next row is then equal to the rank of the previous row plus a gap equal to the number of times the previous rank was given. + +The [dense_rank](./dense_rank.md) function provides the same behaviour but without gaps in ranking. + +**Syntax** + +```sql +rank (column_name) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Returned value** + +- A number for the current row within its partition, including gaps. [UInt64](../data-types/int-uint.md). + +**Example** + +The following example is based on the example provided in the video instructional [Ranking window functions in ClickHouse](https://youtu.be/Yku9mmBYm_4?si=XIMu1jpYucCQEoXA). + +Query: + +```sql +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 195000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 150000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 150000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), + ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), + ('South Hampton Seagulls', 'James Henderson', 140000, 'M'); +``` + +```sql +SELECT player, salary, + rank() OVER (ORDER BY salary DESC) AS rank +FROM salaries; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─rank─┐ +1. │ Gary Chen │ 195000 │ 1 │ +2. │ Robert George │ 195000 │ 1 │ +3. │ Charles Juarez │ 190000 │ 3 │ +4. │ Douglas Benson │ 150000 │ 4 │ +5. │ Michael Stanley │ 150000 │ 4 │ +6. │ Scott Harrison │ 150000 │ 4 │ +7. 
+7. │ James Henderson │ 140000 │ 7 │ + └─────────────────┴────────┴──────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/row_number.md b/docs/en/sql-reference/window-functions/row_number.md new file mode 100644 index 00000000000..e69de29bb2d From 7b3ce3c3b38f698eb80923061c8ec0e309e2cff6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 8 Jul 2024 06:20:10 +0200 Subject: [PATCH 19/70] add leadInFrame, lagInFrame, row_number --- .../window-functions/lagInFrame.md | 79 +++++++++++++++++++ .../window-functions/leadInFrame.md | 60 ++++++++++++++ .../window-functions/row_number.md | 67 ++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 docs/en/sql-reference/window-functions/lagInFrame.md create mode 100644 docs/en/sql-reference/window-functions/leadInFrame.md diff --git a/docs/en/sql-reference/window-functions/lagInFrame.md b/docs/en/sql-reference/window-functions/lagInFrame.md new file mode 100644 index 00000000000..ea9f6d9dea2 --- /dev/null +++ b/docs/en/sql-reference/window-functions/lagInFrame.md @@ -0,0 +1,79 @@ +--- +slug: /en/sql-reference/window-functions/lagInFrame +sidebar_label: lagInFrame +sidebar_position: 5 +--- + +# lagInFrame + +Return a value evaluated at the row that is at a specified physical offset before the current row within the ordered frame. The offset parameter, if not specified, defaults to 1, meaning it will fetch the value from the previous row. If the calculated row exceeds the boundaries of the window frame, the specified default value is returned. + +**Syntax** + +```sql +lagInFrame(x[, offset[, default]]) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Parameters** +- `x` — Column name.
+- `offset` — Offset to apply. [(U)Int*](../data-types/int-uint.md). (Optional - `1` by default). +- `default` — Value to return if calculated row exceeds the boundaries of the window frame. (Optional - `null` by default). + +**Returned value** + +- Value evaluated at the row that is at a specified physical offset before the current row within the ordered frame. + +**Example** + +This example looks at historical data for a specific stock and uses the `lagInFrame` function to calculate a day-to-day delta and percentage change in the closing price of the stock. + +Query: + +```sql +CREATE TABLE stock_prices +( + `date` Date, + `open` Float32, -- opening price + `high` Float32, -- daily high + `low` Float32, -- daily low + `close` Float32, -- closing price + `volume` UInt32 -- trade volume +) +Engine = Memory; + +INSERT INTO stock_prices FORMAT Values + ('2024-06-03', 113.62, 115.00, 112.00, 115.00, 438392000), + ('2024-06-04', 115.72, 116.60, 114.04, 116.44, 403324000), + ('2024-06-05', 118.37, 122.45, 117.47, 122.44, 528402000), + ('2024-06-06', 124.05, 125.59, 118.32, 121.00, 664696000), + ('2024-06-07', 119.77, 121.69, 118.02, 120.89, 412386000); +``` + +```sql +SELECT + date, + close, + lagInFrame(close, 1, close) OVER (ORDER BY date ASC) AS previous_day_close, + COALESCE(ROUND(close - previous_day_close, 2)) AS delta, + COALESCE(ROUND((delta / previous_day_close) * 100, 2)) AS percent_change +FROM stock_prices +ORDER BY date DESC; +``` + +Result: + +```response + ┌───────date─┬──close─┬─previous_day_close─┬─delta─┬─percent_change─┐ +1. │ 2024-06-07 │ 120.89 │ 121 │ -0.11 │ -0.09 │ +2. │ 2024-06-06 │ 121 │ 122.44 │ -1.44 │ -1.18 │ +3. │ 2024-06-05 │ 122.44 │ 116.44 │ 6 │ 5.15 │ +4. │ 2024-06-04 │ 116.44 │ 115 │ 1.44 │ 1.25 │ +5. 
│ 2024-06-03 │ 115 │ 115 │ 0 │ 0 │ + └────────────┴────────┴────────────────────┴───────┴────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/leadInFrame.md b/docs/en/sql-reference/window-functions/leadInFrame.md new file mode 100644 index 00000000000..e3b65af9a4d --- /dev/null +++ b/docs/en/sql-reference/window-functions/leadInFrame.md @@ -0,0 +1,60 @@ +--- +slug: /en/sql-reference/window-functions/leadInFrame +sidebar_label: leadInFrame +sidebar_position: 6 +--- + +# leadInFrame + +Return a value evaluated at the row that is offset rows after the current row within the ordered frame. + +**Syntax** + +```sql +leadInFrame(x[, offset[, default]]) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_within_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Parameters** +- `x` — Column name. +- `offset` — Offset to apply. [(U)Int*](../data-types/int-uint.md). (Optional - `1` by default). +- `default` — Value to return if calculated row exceeds the boundaries of the window frame. (Optional - `null` by default). + +**Returned value** + +- Value evaluated at the row that is offset rows after the current row within the ordered frame. + +**Example** + +This example looks at [historical data](https://www.kaggle.com/datasets/sazidthe1/nobel-prize-data) for Nobel Prize winners and uses the `leadInFrame` function to return a list of successive winners in the physics category.
+ +Query: + +```sql +CREATE OR REPLACE VIEW nobel_prize_laureates AS FROM file('nobel_laureates_data.csv') SELECT *; +``` + +```sql +FROM nobel_prize_laureates SELECT fullName, leadInFrame(year, 1, year) OVER (PARTITION BY category ORDER BY year) AS year, category, motivation WHERE category == 'physics' ORDER BY year DESC LIMIT 9; +``` + +Result: + +```response + ┌─fullName─────────┬─year─┬─category─┬─motivation─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +1. │ Pierre Agostini │ 2023 │ physics │ for experimental methods that generate attosecond pulses of light for the study of electron dynamics in matter │ +2. │ Ferenc Krausz │ 2023 │ physics │ for experimental methods that generate attosecond pulses of light for the study of electron dynamics in matter │ +3. │ Anne L Huillier │ 2023 │ physics │ for experimental methods that generate attosecond pulses of light for the study of electron dynamics in matter │ +4. │ Alain Aspect │ 2022 │ physics │ for experiments with entangled photons establishing the violation of Bell inequalities and pioneering quantum information science │ +5. │ Anton Zeilinger │ 2022 │ physics │ for experiments with entangled photons establishing the violation of Bell inequalities and pioneering quantum information science │ +6. │ John Clauser │ 2022 │ physics │ for experiments with entangled photons establishing the violation of Bell inequalities and pioneering quantum information science │ +7. │ Syukuro Manabe │ 2021 │ physics │ for the physical modelling of Earths climate quantifying variability and reliably predicting global warming │ +8. │ Klaus Hasselmann │ 2021 │ physics │ for the physical modelling of Earths climate quantifying variability and reliably predicting global warming │ +9. 
│ Giorgio Parisi │ 2021 │ physics │ for the discovery of the interplay of disorder and fluctuations in physical systems from atomic to planetary scales │ + └──────────────────┴──────┴──────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/row_number.md b/docs/en/sql-reference/window-functions/row_number.md index e69de29bb2d..428bb34a8ba 100644 --- a/docs/en/sql-reference/window-functions/row_number.md +++ b/docs/en/sql-reference/window-functions/row_number.md @@ -0,0 +1,67 @@ +--- +slug: /en/sql-reference/window-functions/row_number +sidebar_label: row_number +sidebar_position: 4 +--- + +# row_number + +Numbers the current row within its partition starting from 1 + +**Syntax** + +```sql +row_number (column_name) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Returned value** + +- A number for the current row within its partition. [UInt64](../data-types/int-uint.md). + +**Example** + +The following example is based on the example provided in the video instructional [Ranking window functions in ClickHouse](https://youtu.be/Yku9mmBYm_4?si=XIMu1jpYucCQEoXA). 
+ +Query: + +```sql +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 195000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 150000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 150000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'); +``` + +```sql +SELECT player, salary, + row_number() OVER (ORDER BY salary DESC) AS row_number +FROM salaries; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─row_number─┐ +1. │ Gary Chen │ 195000 │ 1 │ +2. │ Robert George │ 195000 │ 2 │ +3. │ Charles Juarez │ 190000 │ 3 │ +4. │ Scott Harrison │ 150000 │ 4 │ +5. │ Michael Stanley │ 150000 │ 5 │ + └─────────────────┴────────┴────────────┘ +``` \ No newline at end of file From 9b1003527dc8aba15729a63a86428390470bff07 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 8 Jul 2024 11:34:39 +0200 Subject: [PATCH 20/70] Fix the order --- .../DataLakes/DeltaLakeMetadata.cpp | 75 ++++++++++--------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index bc64ef15cf1..12341c877e2 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -209,43 +209,6 @@ struct DeltaLakeMetadataImpl // object->stringify(oss); // LOG_TEST(log, "Metadata: {}", oss.str()); - if (object->has("add")) - { - auto add_object = object->get("add").extract(); - auto path = add_object->getValue("path"); - result.insert(fs::path(configuration->getPath()) / path); - - auto filename = fs::path(path).filename().string(); - auto it = file_partition_columns.find(filename); - if (it == file_partition_columns.end()) - { - if 
(add_object->has("partitionValues")) - { - auto partition_values = add_object->get("partitionValues").extract(); - if (partition_values->size()) - { - auto & current_partition_columns = file_partition_columns[filename]; - for (const auto & partition_name : partition_values->getNames()) - { - const auto value = partition_values->getValue(partition_name); - auto name_and_type = file_schema.tryGetByName(partition_name); - if (!name_and_type) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No such column in schema: {}", partition_name); - - auto field = getFieldValue(value, name_and_type->type); - current_partition_columns.emplace_back(*name_and_type, field); - - LOG_TEST(log, "Partition {} value is {} (for {})", partition_name, value, filename); - } - } - } - } - } - else if (object->has("remove")) - { - auto path = object->get("remove").extract()->getValue("path"); - result.erase(fs::path(configuration->getPath()) / path); - } if (object->has("metaData")) { const auto metadata_object = object->get("metaData").extract(); @@ -289,6 +252,44 @@ struct DeltaLakeMetadataImpl file_schema.toString(), current_schema.toString()); } } + + if (object->has("add")) + { + auto add_object = object->get("add").extract(); + auto path = add_object->getValue("path"); + result.insert(fs::path(configuration->getPath()) / path); + + auto filename = fs::path(path).filename().string(); + auto it = file_partition_columns.find(filename); + if (it == file_partition_columns.end()) + { + if (add_object->has("partitionValues")) + { + auto partition_values = add_object->get("partitionValues").extract(); + if (partition_values->size()) + { + auto & current_partition_columns = file_partition_columns[filename]; + for (const auto & partition_name : partition_values->getNames()) + { + const auto value = partition_values->getValue(partition_name); + auto name_and_type = file_schema.tryGetByName(partition_name); + if (!name_and_type) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No such column in schema: 
{}", partition_name); + + auto field = getFieldValue(value, name_and_type->type); + current_partition_columns.emplace_back(*name_and_type, field); + + LOG_TEST(log, "Partition {} value is {} (for {})", partition_name, value, filename); + } + } + } + } + } + else if (object->has("remove")) + { + auto path = object->get("remove").extract()->getValue("path"); + result.erase(fs::path(configuration->getPath()) / path); + } } } From 4227447eac30dd77c6ba70d4b1685bbf11a8221f Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 8 Jul 2024 12:53:55 +0200 Subject: [PATCH 21/70] add nth_value and update ordering --- .../sql-reference/window-functions/index.md | 6 +- .../window-functions/lagInFrame.md | 2 +- .../window-functions/leadInFrame.md | 2 +- .../window-functions/nth_value.md | 77 +++++++++++++++++++ .../en/sql-reference/window-functions/rank.md | 2 +- .../window-functions/row_number.md | 2 +- 6 files changed, 84 insertions(+), 7 deletions(-) create mode 100644 docs/en/sql-reference/window-functions/nth_value.md diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index a0246af610f..ee54a679ba1 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -78,11 +78,11 @@ These functions can be used only as a window function. - [`row_number()`](./row_number.md) - Number the current row within its partition starting from 1. - `first_value(x)` - Return the first non-NULL value evaluated within its ordered frame. - `last_value(x)` - Return the last non-NULL value evaluated within its ordered frame. -- `nth_value(x, offset)` - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. +- [`nth_value(x, offset)`](./nth_value.md) - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. - [`rank()`](./rank.md) - Rank the current row within its partition with gaps. 
- [`dense_rank()`](./dense_rank.md) - Rank the current row within its partition without gaps. -- `lagInFrame(x)` - Return a value evaluated at the row that is at a specified physical offset row before the current row within the ordered frame. -- `leadInFrame(x)` - Return a value evaluated at the row that is offset rows after the current row within the ordered frame. +- [`lagInFrame(x)`](./lagInFrame.md) - Return a value evaluated at the row that is at a specified physical offset row before the current row within the ordered frame. +- [`leadInFrame(x)`](./leadInFrame.md) - Return a value evaluated at the row that is offset rows after the current row within the ordered frame. ## Examples diff --git a/docs/en/sql-reference/window-functions/lagInFrame.md b/docs/en/sql-reference/window-functions/lagInFrame.md index ea9f6d9dea2..b67cf252283 100644 --- a/docs/en/sql-reference/window-functions/lagInFrame.md +++ b/docs/en/sql-reference/window-functions/lagInFrame.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/lagInFrame sidebar_label: lagInFrame -sidebar_position: 5 +sidebar_position: 3 --- # lagInFrame diff --git a/docs/en/sql-reference/window-functions/leadInFrame.md b/docs/en/sql-reference/window-functions/leadInFrame.md index e3b65af9a4d..0cb4eea52b2 100644 --- a/docs/en/sql-reference/window-functions/leadInFrame.md +++ b/docs/en/sql-reference/window-functions/leadInFrame.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/leadInFrame sidebar_label: leadInFrame -sidebar_position: 6 +sidebar_position: 4 --- # leadInFrame diff --git a/docs/en/sql-reference/window-functions/nth_value.md b/docs/en/sql-reference/window-functions/nth_value.md new file mode 100644 index 00000000000..26c90110aaa --- /dev/null +++ b/docs/en/sql-reference/window-functions/nth_value.md @@ -0,0 +1,77 @@ +--- +slug: /en/sql-reference/window-functions/nth_value +sidebar_label: nth_value +sidebar_position: 5 +--- + +# nth_value + +Return the first non-NULL value
evaluated against the nth row (offset) in its ordered frame. + +The [dense_rank](./dense_rank.md) function provides the same behaviour but without gaps in ranking. + +**Syntax** + +```sql +nth_value (x, offset) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_within_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Parameters** + +- `x` — Column name. +- `offset` — nth row to evaluate current row against. + +**Returned value** + +- The first non-NULL value evaluated against the nth row (offset) in its ordered frame. + +**Example** + +In this example the `nth_value` function is used to find the third-highest salary from a fictional dataset of salaries of Premier League football players. + +Query: + +```sql +DROP TABLE IF EXISTS salaries; +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 195000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 10000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 180000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), + ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), + ('South Hampton Seagulls', 'James Henderson', 140000, 'M'); +``` + +```sql +SELECT salary, nth_value(salary,3) OVER(ORDER BY salary DESC) FROM salaries GROUP BY salary; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─rank─┐ +1. │ Gary Chen │ 195000 │ 1 │ +2. │ Robert George │ 195000 │ 1 │ +3. │ Charles Juarez │ 190000 │ 3 │ +4. │ Douglas Benson │ 150000 │ 4 │ +5. │ Michael Stanley │ 150000 │ 4 │ +6. │ Scott Harrison │ 150000 │ 4 │ +7.
│ James Henderson │ 140000 │ 7 │ + └─────────────────┴────────┴──────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/rank.md b/docs/en/sql-reference/window-functions/rank.md index 17db889ef92..9ac99dde6df 100644 --- a/docs/en/sql-reference/window-functions/rank.md +++ b/docs/en/sql-reference/window-functions/rank.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/rank sidebar_label: rank -sidebar_position: 3 +sidebar_position: 6 --- # rank diff --git a/docs/en/sql-reference/window-functions/row_number.md b/docs/en/sql-reference/window-functions/row_number.md index 428bb34a8ba..e7165d60169 100644 --- a/docs/en/sql-reference/window-functions/row_number.md +++ b/docs/en/sql-reference/window-functions/row_number.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/row_number sidebar_label: row_number -sidebar_position: 4 +sidebar_position: 7 --- # row_number From 312dd824254a8518b35c9a3bed75f2887edb769e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 11:35:00 +0000 Subject: [PATCH 22/70] Bump rocksdb to v6.24.2 --- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 112 ++++++------------ ...ksdb_build_version.cc => build_version.cc} | 0 3 files changed, 34 insertions(+), 80 deletions(-) rename contrib/rocksdb-cmake/{rocksdb_build_version.cc => build_version.cc} (100%) diff --git a/contrib/rocksdb b/contrib/rocksdb index 078fa563869..2aed45919b9 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 078fa5638690004e1f744076d1bdcc4e93767304 +Subproject commit 2aed45919b9fee4208221e01f368483fef11be61 diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 943e1d8acbd..5502d3b6205 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -5,20 +5,13 @@ if (NOT ENABLE_ROCKSDB) return() endif() -## this file is extracted from `contrib/rocksdb/CMakeLists.txt` 
-set(ROCKSDB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/rocksdb") -list(APPEND CMAKE_MODULE_PATH "${ROCKSDB_SOURCE_DIR}/cmake/modules/") - -set(PORTABLE ON) -## always disable jemalloc for rocksdb by default -## because it introduces non-standard jemalloc APIs +# Always disable jemalloc for rocksdb by default because it introduces non-standard jemalloc APIs option(WITH_JEMALLOC "build with JeMalloc" OFF) -set(USE_SNAPPY OFF) -if (TARGET ch_contrib::snappy) - set(USE_SNAPPY ON) -endif() -option(WITH_SNAPPY "build with SNAPPY" ${USE_SNAPPY}) -## lz4, zlib, zstd is enabled in ClickHouse by default + +option(WITH_LIBURING "build with liburing" OFF) # TODO could try to enable this conditionally, depending on ClickHouse's ENABLE_LIBURING + +# ClickHouse cannot be compiled without snappy, lz4, zlib, zstd +option(WITH_SNAPPY "build with SNAPPY" ON) option(WITH_LZ4 "build with lz4" ON) option(WITH_ZLIB "build with zlib" ON) option(WITH_ZSTD "build with zstd" ON) @@ -26,74 +19,34 @@ option(WITH_ZSTD "build with zstd" ON) # third-party/folly is only validated to work on Linux and Windows for now. # So only turn it on there by default. 
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Windows") - if(MSVC AND MSVC_VERSION LESS 1910) - # Folly does not compile with MSVC older than VS2017 - option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) - else() - option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) - endif() + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) else() option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) endif() -if( NOT DEFINED CMAKE_CXX_STANDARD ) - set(CMAKE_CXX_STANDARD 11) +if(WITH_SNAPPY) + add_definitions(-DSNAPPY) + list(APPEND THIRDPARTY_LIBS ch_contrib::snappy) endif() -if(MSVC) - option(WITH_XPRESS "build with windows built in compression" OFF) - include("${ROCKSDB_SOURCE_DIR}/thirdparty.inc") -else() - if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") - # FreeBSD has jemalloc as default malloc - # but it does not have all the jemalloc files in include/... - set(WITH_JEMALLOC ON) - else() - if(WITH_JEMALLOC AND TARGET ch_contrib::jemalloc) - add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) - list(APPEND THIRDPARTY_LIBS ch_contrib::jemalloc) - endif() - endif() - - if(WITH_SNAPPY) - add_definitions(-DSNAPPY) - list(APPEND THIRDPARTY_LIBS ch_contrib::snappy) - endif() - - if(WITH_ZLIB) - add_definitions(-DZLIB) - list(APPEND THIRDPARTY_LIBS ch_contrib::zlib) - endif() - - if(WITH_LZ4) - add_definitions(-DLZ4) - list(APPEND THIRDPARTY_LIBS ch_contrib::lz4) - endif() - - if(WITH_ZSTD) - add_definitions(-DZSTD) - list(APPEND THIRDPARTY_LIBS ch_contrib::zstd) - endif() +if(WITH_ZLIB) + add_definitions(-DZLIB) + list(APPEND THIRDPARTY_LIBS ch_contrib::zlib) endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") - if(POWER9) - set(HAS_POWER9 1) - set(HAS_ALTIVEC 1) - else() - set(HAS_POWER8 1) - set(HAS_ALTIVEC 1) - endif(POWER9) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") +if(WITH_LZ4) + add_definitions(-DLZ4) + list(APPEND 
THIRDPARTY_LIBS ch_contrib::lz4) +endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") - set(HAS_ARMV8_CRC 1) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") +if(WITH_ZSTD) + add_definitions(-DZSTD) + list(APPEND THIRDPARTY_LIBS ch_contrib::zstd) +endif() +option(PORTABLE "build a portable binary" ON) -if(ENABLE_AVX2 AND ENABLE_PCLMULQDQ) +if(ENABLE_SSE42 AND ENABLE_PCLMULQDQ) add_definitions(-DHAVE_SSE42) add_definitions(-DHAVE_PCLMUL) endif() @@ -107,8 +60,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-DOS_MACOSX) elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") add_definitions(-DOS_LINUX) -elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") - add_definitions(-DOS_SOLARIS) elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") add_definitions(-DOS_FREEBSD) elseif(CMAKE_SYSTEM_NAME MATCHES "Android") @@ -123,12 +74,10 @@ endif() if (OS_LINUX) add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT) - add_definitions(-DROCKSDB_AUXV_SYSAUXV_PRESENT) add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) -elseif (OS_FREEBSD) - add_definitions(-DROCKSDB_AUXV_SYSAUXV_PRESENT) endif() +set(ROCKSDB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/rocksdb") include_directories(${ROCKSDB_SOURCE_DIR}) include_directories("${ROCKSDB_SOURCE_DIR}/include") @@ -136,8 +85,6 @@ if(WITH_FOLLY_DISTRIBUTED_MUTEX) include_directories("${ROCKSDB_SOURCE_DIR}/third-party/folly") endif() -# Main library source code - set(SOURCES ${ROCKSDB_SOURCE_DIR}/cache/cache.cc ${ROCKSDB_SOURCE_DIR}/cache/cache_entry_roles.cc @@ -333,9 +280,12 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/tools/ldb_tool.cc ${ROCKSDB_SOURCE_DIR}/tools/sst_dump_tool.cc ${ROCKSDB_SOURCE_DIR}/tools/trace_analyzer_tool.cc - ${ROCKSDB_SOURCE_DIR}/trace_replay/trace_replay.cc ${ROCKSDB_SOURCE_DIR}/trace_replay/block_cache_tracer.cc 
${ROCKSDB_SOURCE_DIR}/trace_replay/io_tracer.cc + ${ROCKSDB_SOURCE_DIR}/trace_replay/trace_record_handler.cc + ${ROCKSDB_SOURCE_DIR}/trace_replay/trace_record_result.cc + ${ROCKSDB_SOURCE_DIR}/trace_replay/trace_record.cc + ${ROCKSDB_SOURCE_DIR}/trace_replay/trace_replay.cc ${ROCKSDB_SOURCE_DIR}/util/coding.cc ${ROCKSDB_SOURCE_DIR}/util/compaction_job_stats_impl.cc ${ROCKSDB_SOURCE_DIR}/util/comparator.cc @@ -366,6 +316,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/cassandra/format.cc ${ROCKSDB_SOURCE_DIR}/utilities/cassandra/merge_operator.cc ${ROCKSDB_SOURCE_DIR}/utilities/checkpoint/checkpoint_impl.cc + ${ROCKSDB_SOURCE_DIR}/utilities/compaction_filters.cc ${ROCKSDB_SOURCE_DIR}/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc ${ROCKSDB_SOURCE_DIR}/utilities/debug.cc ${ROCKSDB_SOURCE_DIR}/utilities/env_mirror.cc @@ -374,6 +325,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/fault_injection_fs.cc ${ROCKSDB_SOURCE_DIR}/utilities/leveldb_options/leveldb_options.cc ${ROCKSDB_SOURCE_DIR}/utilities/memory/memory_util.cc + ${ROCKSDB_SOURCE_DIR}/utilities/merge_operators.cc ${ROCKSDB_SOURCE_DIR}/utilities/merge_operators/bytesxor.cc ${ROCKSDB_SOURCE_DIR}/utilities/merge_operators/max.cc ${ROCKSDB_SOURCE_DIR}/utilities/merge_operators/put.cc @@ -393,6 +345,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/simulator_cache/sim_cache.cc ${ROCKSDB_SOURCE_DIR}/utilities/table_properties_collectors/compact_on_deletion_collector.cc ${ROCKSDB_SOURCE_DIR}/utilities/trace/file_trace_reader_writer.cc + ${ROCKSDB_SOURCE_DIR}/utilities/trace/replayer_impl.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/lock_manager.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/point/point_lock_tracker.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/point/point_lock_manager.cc @@ -425,7 +378,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc 
${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc - rocksdb_build_version.cc) + build_version.cc) # generated by hand if(ENABLE_SSE42 AND ENABLE_PCLMULQDQ) set_source_files_properties( @@ -462,5 +415,6 @@ endif() add_library(_rocksdb ${SOURCES}) add_library(ch_contrib::rocksdb ALIAS _rocksdb) target_link_libraries(_rocksdb PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) + # SYSTEM is required to overcome some issues target_include_directories(_rocksdb SYSTEM BEFORE INTERFACE "${ROCKSDB_SOURCE_DIR}/include") diff --git a/contrib/rocksdb-cmake/rocksdb_build_version.cc b/contrib/rocksdb-cmake/build_version.cc similarity index 100% rename from contrib/rocksdb-cmake/rocksdb_build_version.cc rename to contrib/rocksdb-cmake/build_version.cc From adcee80b2d637d9d79a3ecfe4501828339efe050 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 13:03:19 +0000 Subject: [PATCH 23/70] Bump rocksdb to v6.25.3 --- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/rocksdb b/contrib/rocksdb index 2aed45919b9..6df587a7eb3 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 2aed45919b9fee4208221e01f368483fef11be61 +Subproject commit 6df587a7eb3e7bb835a71c2f4a668a51cabefd67 diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 5502d3b6205..48c97257d94 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -88,6 +88,7 @@ endif() set(SOURCES ${ROCKSDB_SOURCE_DIR}/cache/cache.cc ${ROCKSDB_SOURCE_DIR}/cache/cache_entry_roles.cc + ${ROCKSDB_SOURCE_DIR}/cache/cache_reservation_manager.cc ${ROCKSDB_SOURCE_DIR}/cache/clock_cache.cc ${ROCKSDB_SOURCE_DIR}/cache/lru_cache.cc ${ROCKSDB_SOURCE_DIR}/cache/sharded_cache.cc @@ -176,6 +177,7 @@ set(SOURCES 
${ROCKSDB_SOURCE_DIR}/env/file_system_tracer.cc ${ROCKSDB_SOURCE_DIR}/env/fs_remap.cc ${ROCKSDB_SOURCE_DIR}/env/mock_env.cc + ${ROCKSDB_SOURCE_DIR}/env/unique_id.cc ${ROCKSDB_SOURCE_DIR}/file/delete_scheduler.cc ${ROCKSDB_SOURCE_DIR}/file/file_prefetch_buffer.cc ${ROCKSDB_SOURCE_DIR}/file/file_util.cc @@ -297,6 +299,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/util/murmurhash.cc ${ROCKSDB_SOURCE_DIR}/util/random.cc ${ROCKSDB_SOURCE_DIR}/util/rate_limiter.cc + ${ROCKSDB_SOURCE_DIR}/util/regex.cc ${ROCKSDB_SOURCE_DIR}/util/ribbon_config.cc ${ROCKSDB_SOURCE_DIR}/util/slice.cc ${ROCKSDB_SOURCE_DIR}/util/file_checksum_helper.cc From 31f9bed44285eeaae964f98e5a4ce3150d57ac5f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 13:43:28 +0000 Subject: [PATCH 24/70] Bump rocksdb to 6.26.1 --- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/contrib/rocksdb b/contrib/rocksdb index 6df587a7eb3..19ab8db7a73 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 6df587a7eb3e7bb835a71c2f4a668a51cabefd67 +Subproject commit 19ab8db7a736306d6d12992a21e545e0336ab34a diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 48c97257d94..525b301f31f 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -177,7 +177,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/env/file_system_tracer.cc ${ROCKSDB_SOURCE_DIR}/env/fs_remap.cc ${ROCKSDB_SOURCE_DIR}/env/mock_env.cc - ${ROCKSDB_SOURCE_DIR}/env/unique_id.cc + ${ROCKSDB_SOURCE_DIR}/env/unique_id_gen.cc ${ROCKSDB_SOURCE_DIR}/file/delete_scheduler.cc ${ROCKSDB_SOURCE_DIR}/file/file_prefetch_buffer.cc ${ROCKSDB_SOURCE_DIR}/file/file_util.cc @@ -271,6 +271,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/table/table_factory.cc ${ROCKSDB_SOURCE_DIR}/table/table_properties.cc ${ROCKSDB_SOURCE_DIR}/table/two_level_iterator.cc + ${ROCKSDB_SOURCE_DIR}/table/unique_id.cc 
${ROCKSDB_SOURCE_DIR}/test_util/sync_point.cc ${ROCKSDB_SOURCE_DIR}/test_util/sync_point_impl.cc ${ROCKSDB_SOURCE_DIR}/test_util/testutil.cc @@ -315,6 +316,8 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/blob_db/blob_db_impl_filesnapshot.cc ${ROCKSDB_SOURCE_DIR}/utilities/blob_db/blob_dump_tool.cc ${ROCKSDB_SOURCE_DIR}/utilities/blob_db/blob_file.cc + ${ROCKSDB_SOURCE_DIR}/utilities/cache_dump_load.cc + ${ROCKSDB_SOURCE_DIR}/utilities/cache_dump_load_impl.cc ${ROCKSDB_SOURCE_DIR}/utilities/cassandra/cassandra_compaction_filter.cc ${ROCKSDB_SOURCE_DIR}/utilities/cassandra/format.cc ${ROCKSDB_SOURCE_DIR}/utilities/cassandra/merge_operator.cc @@ -367,6 +370,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/transactions/write_unprepared_txn.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/write_unprepared_txn_db.cc ${ROCKSDB_SOURCE_DIR}/utilities/ttl/db_ttl_impl.cc + ${ROCKSDB_SOURCE_DIR}/utilities/wal_filter.cc ${ROCKSDB_SOURCE_DIR}/utilities/write_batch_with_index/write_batch_with_index.cc ${ROCKSDB_SOURCE_DIR}/utilities/write_batch_with_index/write_batch_with_index_internal.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc From 9509802866206df50b7802ab74a556e9fd979852 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 13:53:24 +0000 Subject: [PATCH 25/70] Bump rocksdb to v6.27.3 --- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/rocksdb b/contrib/rocksdb index 19ab8db7a73..630bc2d1c3b 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 19ab8db7a736306d6d12992a21e545e0336ab34a +Subproject commit 630bc2d1c3bcf654ebada4d7a092996de8cfb779 diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 525b301f31f..d6e2a1afd50 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -104,6 +104,7 @@ set(SOURCES 
${ROCKSDB_SOURCE_DIR}/db/blob/blob_log_format.cc ${ROCKSDB_SOURCE_DIR}/db/blob/blob_log_sequential_reader.cc ${ROCKSDB_SOURCE_DIR}/db/blob/blob_log_writer.cc + ${ROCKSDB_SOURCE_DIR}/db/blob/prefetch_buffer_collection.cc ${ROCKSDB_SOURCE_DIR}/db/builder.cc ${ROCKSDB_SOURCE_DIR}/db/c.cc ${ROCKSDB_SOURCE_DIR}/db/column_family.cc @@ -329,6 +330,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/env_timed.cc ${ROCKSDB_SOURCE_DIR}/utilities/fault_injection_env.cc ${ROCKSDB_SOURCE_DIR}/utilities/fault_injection_fs.cc + ${ROCKSDB_SOURCE_DIR}/utilities/fault_injection_secondary_cache.cc ${ROCKSDB_SOURCE_DIR}/utilities/leveldb_options/leveldb_options.cc ${ROCKSDB_SOURCE_DIR}/utilities/memory/memory_util.cc ${ROCKSDB_SOURCE_DIR}/utilities/merge_operators.cc From 1bfafa42be11ad338a76746f695ffa1710d198d1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 14:38:51 +0000 Subject: [PATCH 26/70] Bump rocksdb to v6.28.2 --- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/rocksdb b/contrib/rocksdb index 630bc2d1c3b..b8a996ce196 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 630bc2d1c3bcf654ebada4d7a092996de8cfb779 +Subproject commit b8a996ce1969a3f7141aca7fb5c54196a58a654a diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index d6e2a1afd50..96558b40174 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -88,6 +88,7 @@ endif() set(SOURCES ${ROCKSDB_SOURCE_DIR}/cache/cache.cc ${ROCKSDB_SOURCE_DIR}/cache/cache_entry_roles.cc + ${ROCKSDB_SOURCE_DIR}/cache/cache_key.cc ${ROCKSDB_SOURCE_DIR}/cache/cache_reservation_manager.cc ${ROCKSDB_SOURCE_DIR}/cache/clock_cache.cc ${ROCKSDB_SOURCE_DIR}/cache/lru_cache.cc @@ -197,6 +198,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/memory/concurrent_arena.cc ${ROCKSDB_SOURCE_DIR}/memory/jemalloc_nodump_allocator.cc 
${ROCKSDB_SOURCE_DIR}/memory/memkind_kmem_allocator.cc + ${ROCKSDB_SOURCE_DIR}/memory/memory_allocator.cc ${ROCKSDB_SOURCE_DIR}/memtable/alloc_tracker.cc ${ROCKSDB_SOURCE_DIR}/memtable/hash_linklist_rep.cc ${ROCKSDB_SOURCE_DIR}/memtable/hash_skiplist_rep.cc From 9ba10ca604ad6705ad46a60b9d03569c4729afcc Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 30 Jun 2024 17:22:42 +0200 Subject: [PATCH 27/70] Remove mysqlxx::Pool::Entry assignment operator v2: fix tidy https://s3.amazonaws.com/clickhouse-builds/PRs/65920/86789491be1a945602f6ebf0b3b93bf5272e52ab/binary_tidy/build_log.log Signed-off-by: Azat Khuzhin --- src/Common/mysqlxx/Pool.cpp | 1 - src/Common/mysqlxx/mysqlxx/Pool.h | 11 ----------- src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp | 4 +--- src/Databases/MySQL/MaterializedMySQLSyncThread.cpp | 12 +++++++----- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/Common/mysqlxx/Pool.cpp b/src/Common/mysqlxx/Pool.cpp index cc5b18214c8..546e9e91dc7 100644 --- a/src/Common/mysqlxx/Pool.cpp +++ b/src/Common/mysqlxx/Pool.cpp @@ -228,7 +228,6 @@ Pool::Entry Pool::tryGet() for (auto connection_it = connections.cbegin(); connection_it != connections.cend();) { Connection * connection_ptr = *connection_it; - /// Fixme: There is a race condition here b/c we do not synchronize with Pool::Entry's copy-assignment operator if (connection_ptr->ref_count == 0) { { diff --git a/src/Common/mysqlxx/mysqlxx/Pool.h b/src/Common/mysqlxx/mysqlxx/Pool.h index 6e509d8bdd6..f1ef81e28dd 100644 --- a/src/Common/mysqlxx/mysqlxx/Pool.h +++ b/src/Common/mysqlxx/mysqlxx/Pool.h @@ -64,17 +64,6 @@ public: decrementRefCount(); } - Entry & operator= (const Entry & src) /// NOLINT - { - pool = src.pool; - if (data) - decrementRefCount(); - data = src.data; - if (data) - incrementRefCount(); - return * this; - } - bool isNull() const { return data == nullptr; diff --git a/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp b/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp index 
61d6a117285..121767edc84 100644 --- a/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp +++ b/src/Common/mysqlxx/tests/mysqlxx_pool_test.cpp @@ -13,13 +13,11 @@ mysqlxx::Pool::Entry getWithFailover(mysqlxx::Pool & connections_pool) constexpr size_t max_tries = 3; - mysqlxx::Pool::Entry worker_connection; - for (size_t try_no = 1; try_no <= max_tries; ++try_no) { try { - worker_connection = connections_pool.tryGet(); + mysqlxx::Pool::Entry worker_connection = connections_pool.tryGet(); if (!worker_connection.isNull()) { diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index 7ab4235feeb..27ebe0b6d21 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -532,13 +533,17 @@ static inline void dumpDataForTables( bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & metadata) { bool opened_transaction = false; - mysqlxx::PoolWithFailover::Entry connection; while (!isCancelled()) { try { - connection = pool.tryGet(); + mysqlxx::PoolWithFailover::Entry connection = pool.tryGet(); + SCOPE_EXIT({ + if (opened_transaction) + connection->query("ROLLBACK").execute(); + }); + if (connection.isNull()) { if (settings->max_wait_time_when_mysql_unavailable < 0) @@ -602,9 +607,6 @@ bool MaterializedMySQLSyncThread::prepareSynchronized(MaterializeMetadata & meta { tryLogCurrentException(log); - if (opened_transaction) - connection->query("ROLLBACK").execute(); - if (settings->max_wait_time_when_mysql_unavailable < 0) throw; From bd42a096b7e023195316d798188adfe0a98555f5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 8 Jul 2024 16:52:36 +0000 Subject: [PATCH 28/70] Bump rocksdb to v6.29.5 --- contrib/rocksdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/rocksdb b/contrib/rocksdb index b8a996ce196..be366233921 
160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit b8a996ce1969a3f7141aca7fb5c54196a58a654a +Subproject commit be366233921293bd07a84dc4ea6991858665f202 From 9153e65456dd3a90d9be85c1a8a52592ce054e77 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Jul 2024 10:15:41 +0200 Subject: [PATCH 29/70] Remove unneded include --- src/DataTypes/DataTypeDynamic.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp index 6826c46a1a7..c920e69c13b 100644 --- a/src/DataTypes/DataTypeDynamic.cpp +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -12,7 +12,6 @@ #include #include #include -#include namespace DB { From 6baa52d10176369fefc6249dbb256a26eb7b1bdc Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:01:28 +0200 Subject: [PATCH 30/70] Fix null insertion into dynamic column --- src/Formats/JSONExtractTree.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index 8fe472930d3..9efb1392583 100644 --- a/src/Formats/JSONExtractTree.cpp +++ b/src/Formats/JSONExtractTree.cpp @@ -1265,9 +1265,16 @@ public: bool insertResultToColumn(IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override { auto & column_dynamic = assert_cast(column); + /// First, check if element is NULL. + if (element.isNull()) + { + column_dynamic.insertDefault(); + return true; + } + auto & variant_column = column_dynamic.getVariantColumn(); auto variant_info = column_dynamic.getVariantInfo(); - /// First, infer ClickHouse type for this element and add it as a new variant. + /// Second, infer ClickHouse type for this element and add it as a new variant. 
auto element_type = elementToDataType(element, format_settings); if (column_dynamic.addNewVariant(element_type)) { From 61f827b5698754af84e7d75a2b83bd0191139820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Tue, 9 Jul 2024 15:26:33 +0200 Subject: [PATCH 31/70] Update src/Databases/DatabaseAtomic.cpp --- src/Databases/DatabaseAtomic.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index a48eb2abce6..5b816e4f282 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -106,8 +106,8 @@ void DatabaseAtomic::attachTable(ContextPtr /* context_ */, const String & name, StoragePtr DatabaseAtomic::detachTable(ContextPtr /* context */, const String & name) { - // it is important to call destructures not_in_use without - // blocking mutex for avoid potential deadlock. + // it is important to call the destructors of not_in_use without + // locked mutex to avoid potential deadlock. 
DetachedTables not_in_use; StoragePtr table; { From 8a202d91ad745089adaff4ebf2cde5e6754503ce Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 9 Jul 2024 16:24:35 +0200 Subject: [PATCH 32/70] Properly read schema and partition columns from checkpoint file --- .../DataLakes/DeltaLakeMetadata.cpp | 169 ++++++++++++------ .../DataLakes/IStorageDataLake.h | 10 +- .../StorageObjectStorageSource.cpp | 28 +-- tests/integration/test_storage_delta/test.py | 129 +++++++++++-- 4 files changed, 255 insertions(+), 81 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 12341c877e2..d37bffc42c4 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include @@ -111,7 +113,7 @@ struct DeltaLakeMetadataImpl std::set result_files; NamesAndTypesList current_schema; DataLakePartitionColumns current_partition_columns; - const auto checkpoint_version = getCheckpointIfExists(result_files); + const auto checkpoint_version = getCheckpointIfExists(result_files, current_schema, current_partition_columns); if (checkpoint_version) { @@ -205,9 +207,9 @@ struct DeltaLakeMetadataImpl Poco::Dynamic::Var json = parser.parse(json_str); Poco::JSON::Object::Ptr object = json.extract(); - // std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM - // object->stringify(oss); - // LOG_TEST(log, "Metadata: {}", oss.str()); + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + object->stringify(oss); + LOG_TEST(log, "Metadata: {}", oss.str()); if (object->has("metaData")) { @@ -216,30 +218,9 @@ struct DeltaLakeMetadataImpl Poco::JSON::Parser p; Poco::Dynamic::Var fields_json = parser.parse(schema_object); - Poco::JSON::Object::Ptr fields_object = fields_json.extract(); - - 
const auto fields = fields_object->get("fields").extract(); - NamesAndTypesList current_schema; - for (size_t i = 0; i < fields->size(); ++i) - { - const auto field = fields->getObject(static_cast(i)); - auto column_name = field->getValue("name"); - auto type = field->getValue("type"); - auto is_nullable = field->getValue("nullable"); - - std::string physical_name; - auto schema_metadata_object = field->get("metadata").extract(); - if (schema_metadata_object->has("delta.columnMapping.physicalName")) - physical_name = schema_metadata_object->getValue("delta.columnMapping.physicalName"); - else - physical_name = column_name; - - LOG_TEST(log, "Found column: {}, type: {}, nullable: {}, physical name: {}", - column_name, type, is_nullable, physical_name); - - current_schema.push_back({physical_name, getFieldType(field, "type", is_nullable)}); - } + const Poco::JSON::Object::Ptr & fields_object = fields_json.extract(); + auto current_schema = parseMetadata(fields_object); if (file_schema.empty()) { file_schema = current_schema; @@ -274,7 +255,12 @@ struct DeltaLakeMetadataImpl const auto value = partition_values->getValue(partition_name); auto name_and_type = file_schema.tryGetByName(partition_name); if (!name_and_type) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No such column in schema: {}", partition_name); + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "No such column in schema: {} (schema: {})", + partition_name, file_schema.toNamesAndTypesDescription()); + } auto field = getFieldValue(value, name_and_type->type); current_partition_columns.emplace_back(*name_and_type, field); @@ -293,6 +279,32 @@ struct DeltaLakeMetadataImpl } } + NamesAndTypesList parseMetadata(const Poco::JSON::Object::Ptr & metadata_json) + { + NamesAndTypesList schema; + const auto fields = metadata_json->get("fields").extract(); + for (size_t i = 0; i < fields->size(); ++i) + { + const auto field = fields->getObject(static_cast(i)); + auto column_name = field->getValue("name"); + auto 
type = field->getValue("type"); + auto is_nullable = field->getValue("nullable"); + + std::string physical_name; + auto schema_metadata_object = field->get("metadata").extract(); + if (schema_metadata_object->has("delta.columnMapping.physicalName")) + physical_name = schema_metadata_object->getValue("delta.columnMapping.physicalName"); + else + physical_name = column_name; + + LOG_TEST(log, "Found column: {}, type: {}, nullable: {}, physical name: {}", + column_name, type, is_nullable, physical_name); + + schema.push_back({physical_name, getFieldType(field, "type", is_nullable)}); + } + return schema; + } + DataTypePtr getFieldType(const Poco::JSON::Object::Ptr & field, const String & type_key, bool is_nullable) { if (field->isObject(type_key)) @@ -506,7 +518,10 @@ struct DeltaLakeMetadataImpl throw Exception(ErrorCodes::BAD_ARGUMENTS, "Arrow error: {}", _s.ToString()); \ } while (false) - size_t getCheckpointIfExists(std::set & result) + size_t getCheckpointIfExists( + std::set & result, + NamesAndTypesList & file_schema, + DataLakePartitionColumns & file_partition_columns) { const auto version = readLastCheckpointIfExists(); if (!version) @@ -527,7 +542,8 @@ struct DeltaLakeMetadataImpl auto columns = ParquetSchemaReader(*buf, format_settings).readSchema(); /// Read only columns that we need. 
- columns.filterColumns(NameSet{"add", "remove"}); + auto filter_column_names = NameSet{"add", "metaData"}; + columns.filterColumns(filter_column_names); Block header; for (const auto & column : columns) header.insert({column.type->createColumn(), column.type, column.name}); @@ -541,9 +557,6 @@ struct DeltaLakeMetadataImpl ArrowMemoryPool::instance(), &reader)); - std::shared_ptr file_schema; - THROW_ARROW_NOT_OK(reader->GetSchema(&file_schema)); - ArrowColumnToCHColumn column_reader( header, "Parquet", format_settings.parquet.allow_missing_columns, @@ -554,29 +567,85 @@ struct DeltaLakeMetadataImpl std::shared_ptr table; THROW_ARROW_NOT_OK(reader->ReadTable(&table)); - Chunk res = column_reader.arrowTableToCHChunk(table, reader->parquet_reader()->metadata()->num_rows()); - const auto & res_columns = res.getColumns(); + Chunk chunk = column_reader.arrowTableToCHChunk(table, reader->parquet_reader()->metadata()->num_rows()); + auto res_block = header.cloneWithColumns(chunk.detachColumns()); + res_block = Nested::flatten(res_block); - if (res_columns.size() != 2) - { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "Unexpected number of columns: {} (having: {}, expected: {})", - res_columns.size(), res.dumpStructure(), header.dumpStructure()); - } + const auto * nullable_path_column = assert_cast(res_block.getByName("add.path").column.get()); + const auto & path_column = assert_cast(nullable_path_column->getNestedColumn()); + + const auto * nullable_schema_column = assert_cast(res_block.getByName("metaData.schemaString").column.get()); + const auto & schema_column = assert_cast(nullable_schema_column->getNestedColumn()); + + auto partition_values_column_raw = res_block.getByName("add.partitionValues").column; + const auto & partition_values_column = assert_cast(*partition_values_column_raw); - const auto * tuple_column = assert_cast(res_columns[0].get()); - const auto & nullable_column = assert_cast(tuple_column->getColumn(0)); - const auto & path_column = 
assert_cast(nullable_column.getNestedColumn()); for (size_t i = 0; i < path_column.size(); ++i) { - const auto filename = String(path_column.getDataAt(i)); - if (filename.empty()) + const auto metadata = String(schema_column.getDataAt(i)); + if (!metadata.empty()) + { + Poco::JSON::Parser parser; + Poco::Dynamic::Var json = parser.parse(metadata); + const Poco::JSON::Object::Ptr & object = json.extract(); + + auto current_schema = parseMetadata(object); + if (file_schema.empty()) + { + file_schema = current_schema; + LOG_TEST(log, "Processed schema from checkpoint: {}", file_schema.toString()); + } + else if (file_schema != current_schema) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Reading from files with different schema is not possible " + "({} is different from {})", + file_schema.toString(), current_schema.toString()); + } + } + } + + for (size_t i = 0; i < path_column.size(); ++i) + { + const auto path = String(path_column.getDataAt(i)); + if (path.empty()) continue; - LOG_TEST(log, "Adding {}", filename); - const auto [_, inserted] = result.insert(std::filesystem::path(configuration->getPath()) / filename); + + auto filename = fs::path(path).filename().string(); + auto it = file_partition_columns.find(filename); + if (it == file_partition_columns.end()) + { + Field map; + partition_values_column.get(i, map); + auto partition_values_map = map.safeGet(); + if (!partition_values_map.empty()) + { + auto & current_partition_columns = file_partition_columns[filename]; + for (const auto & map_value : partition_values_map) + { + const auto tuple = map_value.safeGet(); + const auto partition_name = tuple[0].safeGet(); + auto name_and_type = file_schema.tryGetByName(partition_name); + if (!name_and_type) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "No such column in schema: {} (schema: {})", + partition_name, file_schema.toString()); + } + const auto value = tuple[1].safeGet(); + auto field = getFieldValue(value, name_and_type->type); + 
current_partition_columns.emplace_back(*name_and_type, field); + + LOG_TEST(log, "Partition {} value is {} (for {})", partition_name, value, filename); + } + } + } + + LOG_TEST(log, "Adding {}", path); + const auto [_, inserted] = result.insert(std::filesystem::path(configuration->getPath()) / path); if (!inserted) - throw Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", filename); + throw Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", path); } return version; diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index f1217bc9729..d6935c706d9 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -17,6 +17,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} /// Storage for read-only integration with Apache Iceberg tables in Amazon S3 (see https://iceberg.apache.org/) /// Right now it's implemented on top of StorageS3 and right now it doesn't support @@ -41,6 +45,7 @@ public: auto object_storage = base_configuration->createObjectStorage(context, /* is_readonly */true); DataLakeMetadataPtr metadata; NamesAndTypesList schema_from_metadata; + const bool use_schema_from_metadata = columns_.empty(); if (base_configuration->format == "auto") base_configuration->format = "Parquet"; @@ -50,8 +55,9 @@ public: try { metadata = DataLakeMetadata::create(object_storage, base_configuration, context); - schema_from_metadata = metadata->getTableSchema(); configuration->setPaths(metadata->getDataFiles()); + if (use_schema_from_metadata) + schema_from_metadata = metadata->getTableSchema(); } catch (...) { @@ -66,7 +72,7 @@ public: return std::make_shared>( base_configuration, std::move(metadata), configuration, object_storage, context, table_id_, - columns_.empty() ? ColumnsDescription(schema_from_metadata) : columns_, + use_schema_from_metadata ? 
ColumnsDescription(schema_from_metadata) : columns_, constraints_, comment_, format_settings_); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 6940f10cb91..a9a7e062076 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -206,23 +206,25 @@ Chunk StorageObjectStorageSource::generate() if (!partition_columns.empty() && chunk_size && chunk.hasColumns()) { auto partition_values = partition_columns.find(filename); - - for (const auto & [name_and_type, value] : partition_values->second) + if (partition_values != partition_columns.end()) { - if (!read_from_format_info.source_header.has(name_and_type.name)) - continue; + for (const auto & [name_and_type, value] : partition_values->second) + { + if (!read_from_format_info.source_header.has(name_and_type.name)) + continue; - const auto column_pos = read_from_format_info.source_header.getPositionByName(name_and_type.name); - auto partition_column = name_and_type.type->createColumnConst(chunk.getNumRows(), value)->convertToFullColumnIfConst(); + const auto column_pos = read_from_format_info.source_header.getPositionByName(name_and_type.name); + auto partition_column = name_and_type.type->createColumnConst(chunk.getNumRows(), value)->convertToFullColumnIfConst(); - /// This column is filled with default value now, remove it. - chunk.erase(column_pos); + /// This column is filled with default value now, remove it. + chunk.erase(column_pos); - /// Add correct values. - if (chunk.hasColumns()) - chunk.addColumn(column_pos, std::move(partition_column)); - else - chunk.addColumn(std::move(partition_column)); + /// Add correct values. 
+ if (column_pos < chunk.getNumColumns()) + chunk.addColumn(column_pos, std::move(partition_column)); + else + chunk.addColumn(std::move(partition_column)); + } } } return chunk; diff --git a/tests/integration/test_storage_delta/test.py b/tests/integration/test_storage_delta/test.py index 4cb71895881..d3dd7cfe52a 100644 --- a/tests/integration/test_storage_delta/test.py +++ b/tests/integration/test_storage_delta/test.py @@ -596,19 +596,116 @@ def test_partition_columns(started_cluster): ) assert result == 1 - # instance.query( - # f""" - # DROP TABLE IF EXISTS {TABLE_NAME}; - # CREATE TABLE {TABLE_NAME} (a Int32, b String, c DateTime) - # ENGINE=DeltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', 'minio123')""" - # ) - # assert ( - # int( - # instance.query( - # f"SELECT count() FROM {TABLE_NAME} WHERE c != toDateTime('2000/01/05')" - # ) - # ) - # == num_rows - 1 - # ) - # instance.query(f"SELECT a, b, c, FROM {TABLE_NAME}") - # assert False + instance.query( + f""" + DROP TABLE IF EXISTS {TABLE_NAME}; + CREATE TABLE {TABLE_NAME} (a Nullable(Int32), b Nullable(String), c Nullable(Date32), d Nullable(Int32), e Nullable(Bool)) + ENGINE=DeltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', 'minio123')""" + ) + assert ( + """1 test1 2000-01-01 1 false +2 test2 2000-01-02 2 false +3 test3 2000-01-03 3 false +4 test4 2000-01-04 4 false +5 test5 2000-01-05 5 false +6 test6 2000-01-06 6 false +7 test7 2000-01-07 7 false +8 test8 2000-01-08 8 false +9 test9 2000-01-09 9 false""" + == instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY b").strip() + ) + + assert ( + int( + instance.query( + f"SELECT count() FROM {TABLE_NAME} WHERE c == toDateTime('2000/01/05')" + ) + ) + == 1 + ) + + # Subset of columns should work. 
+ instance.query( + f""" + DROP TABLE IF EXISTS {TABLE_NAME}; + CREATE TABLE {TABLE_NAME} (b Nullable(String), c Nullable(Date32), d Nullable(Int32)) + ENGINE=DeltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', 'minio123')""" + ) + assert ( + """test1 2000-01-01 1 +test2 2000-01-02 2 +test3 2000-01-03 3 +test4 2000-01-04 4 +test5 2000-01-05 5 +test6 2000-01-06 6 +test7 2000-01-07 7 +test8 2000-01-08 8 +test9 2000-01-09 9""" + == instance.query(f"SELECT * FROM {TABLE_NAME} ORDER BY b").strip() + ) + + for i in range(num_rows + 1, 2 * num_rows + 1): + data = [ + ( + i, + "test" + str(i), + datetime.strptime(f"2000-01-{i}", "%Y-%m-%d"), + i, + False, + ) + ] + df = spark.createDataFrame(data=data, schema=schema) + df.printSchema() + df.write.mode("append").format("delta").partitionBy(partition_columns).save( + f"/{TABLE_NAME}" + ) + + files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "") + ok = False + for file in files: + if file.endswith("last_checkpoint"): + ok = True + assert ok + + result = int( + instance.query( + f"""SELECT count() + FROM deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', 'minio123') + """ + ) + ) + assert result == num_rows * 2 + + assert ( + """1 test1 2000-01-01 1 false +2 test2 2000-01-02 2 false +3 test3 2000-01-03 3 false +4 test4 2000-01-04 4 false +5 test5 2000-01-05 5 false +6 test6 2000-01-06 6 false +7 test7 2000-01-07 7 false +8 test8 2000-01-08 8 false +9 test9 2000-01-09 9 false +10 test10 2000-01-10 10 false +11 test11 2000-01-11 11 false +12 test12 2000-01-12 12 false +13 test13 2000-01-13 13 false +14 test14 2000-01-14 14 false +15 test15 2000-01-15 15 false +16 test16 2000-01-16 16 false +17 test17 2000-01-17 17 false +18 test18 2000-01-18 18 false""" + == instance.query( + f""" +SELECT * FROM deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 
'minio', 'minio123') ORDER BY c + """ + ).strip() + ) + assert ( + int( + instance.query( + f"SELECT count() FROM {TABLE_NAME} WHERE c == toDateTime('2000/01/15')" + ) + ) + == 1 + ) From 55468caeaee9c7c5074897d50f5aa2c4fe4d584e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 9 Jul 2024 16:38:22 +0000 Subject: [PATCH 33/70] Fix ARM build --- contrib/rocksdb-cmake/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 96558b40174..3a14407166c 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -51,6 +51,14 @@ if(ENABLE_SSE42 AND ENABLE_PCLMULQDQ) add_definitions(-DHAVE_PCLMUL) endif() +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") + set (HAS_ARMV8_CRC 1) + # the original build descriptions set specific flags for ARM. These flags are already subsumed by ClickHouse's general + # ARM flags, see cmake/cpu_features.cmake + # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") +endif() + set (HAVE_THREAD_LOCAL 1) if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) From b4f59b96c274fcde50050e172a91d455eddcb17f Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:30:15 +0200 Subject: [PATCH 34/70] Update IStorageDataLake.h --- src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index d6935c706d9..c8603fccb86 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -17,10 +17,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} /// 
Storage for read-only integration with Apache Iceberg tables in Amazon S3 (see https://iceberg.apache.org/) /// Right now it's implemented on top of StorageS3 and right now it doesn't support From 9fc557ad65ab0a306e417d01ea0b4636a0569824 Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Tue, 9 Jul 2024 17:36:09 +0000 Subject: [PATCH 35/70] Ignore ON CLUSTER clause in queries for management of replicated named collections --- .../NamedCollectionsFactory.cpp | 9 +++++- .../NamedCollectionsFactory.h | 2 ++ .../NamedCollectionsMetadataStorage.cpp | 12 ++++---- .../NamedCollectionsMetadataStorage.h | 2 +- src/Core/Settings.h | 1 + .../InterpreterAlterNamedCollectionQuery.cpp | 7 +++-- .../InterpreterCreateNamedCollectionQuery.cpp | 7 +++-- .../InterpreterDropNamedCollectionQuery.cpp | 7 +++-- .../removeOnClusterClauseIfNeeded.cpp | 16 ++++++++++- .../named_collections_with_zookeeper.xml | 17 +++++++++++ .../configs/users.d/users.xml | 5 ++++ .../test_named_collections/test.py | 28 +++++++++++++++++++ 12 files changed, 98 insertions(+), 15 deletions(-) diff --git a/src/Common/NamedCollections/NamedCollectionsFactory.cpp b/src/Common/NamedCollections/NamedCollectionsFactory.cpp index 14105a8651d..2faea1957ba 100644 --- a/src/Common/NamedCollections/NamedCollectionsFactory.cpp +++ b/src/Common/NamedCollections/NamedCollectionsFactory.cpp @@ -235,7 +235,7 @@ bool NamedCollectionFactory::loadIfNot(std::lock_guard & lock) loadFromConfig(context->getConfigRef(), lock); loadFromSQL(lock); - if (metadata_storage->supportsPeriodicUpdate()) + if (metadata_storage->isReplicated()) { update_task = context->getSchedulePool().createTask("NamedCollectionsMetadataStorage", [this]{ updateFunc(); }); update_task->activate(); @@ -357,6 +357,13 @@ void NamedCollectionFactory::reloadFromSQL() add(std::move(collections), lock); } +bool NamedCollectionFactory::usesReplicatedStorage() +{ + std::lock_guard lock(mutex); + loadIfNot(lock); + return metadata_storage->isReplicated(); +} + void 
NamedCollectionFactory::updateFunc() { LOG_TRACE(log, "Named collections background updating thread started"); diff --git a/src/Common/NamedCollections/NamedCollectionsFactory.h b/src/Common/NamedCollections/NamedCollectionsFactory.h index 6ee5940e686..a0721ad8a50 100644 --- a/src/Common/NamedCollections/NamedCollectionsFactory.h +++ b/src/Common/NamedCollections/NamedCollectionsFactory.h @@ -34,6 +34,8 @@ public: void updateFromSQL(const ASTAlterNamedCollectionQuery & query); + bool usesReplicatedStorage(); + void loadIfNot(); void shutdown(); diff --git a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp index 32fdb25abd3..b3671350f92 100644 --- a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp +++ b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp @@ -67,7 +67,7 @@ public: virtual bool removeIfExists(const std::string & path) = 0; - virtual bool supportsPeriodicUpdate() const = 0; + virtual bool isReplicated() const = 0; virtual bool waitUpdate(size_t /* timeout */) { return false; } }; @@ -89,7 +89,7 @@ public: ~LocalStorage() override = default; - bool supportsPeriodicUpdate() const override { return false; } + bool isReplicated() const override { return false; } std::vector list() const override { @@ -221,7 +221,7 @@ public: ~ZooKeeperStorage() override = default; - bool supportsPeriodicUpdate() const override { return true; } + bool isReplicated() const override { return true; } /// Return true if children changed. 
bool waitUpdate(size_t timeout) override @@ -465,14 +465,14 @@ void NamedCollectionsMetadataStorage::writeCreateQuery(const ASTCreateNamedColle storage->write(getFileName(query.collection_name), serializeAST(*normalized_query), replace); } -bool NamedCollectionsMetadataStorage::supportsPeriodicUpdate() const +bool NamedCollectionsMetadataStorage::isReplicated() const { - return storage->supportsPeriodicUpdate(); + return storage->isReplicated(); } bool NamedCollectionsMetadataStorage::waitUpdate() { - if (!storage->supportsPeriodicUpdate()) + if (!storage->isReplicated()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Periodic updates are not supported"); const auto & config = Context::getGlobalContextInstance()->getConfigRef(); diff --git a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h index 3c089fe2fa2..c3468fbc468 100644 --- a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h +++ b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h @@ -30,7 +30,7 @@ public: /// Return true if update was made bool waitUpdate(); - bool supportsPeriodicUpdate() const; + bool isReplicated() const; private: class INamedCollectionsStorage; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d84e5b149f6..6c53837138b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -364,6 +364,7 @@ class IColumn; \ M(Bool, ignore_on_cluster_for_replicated_udf_queries, false, "Ignore ON CLUSTER clause for replicated UDF management queries.", 0) \ M(Bool, ignore_on_cluster_for_replicated_access_entities_queries, false, "Ignore ON CLUSTER clause for replicated access entities management queries.", 0) \ + M(Bool, ignore_on_cluster_for_replicated_named_collections_queries, false, "Ignore ON CLUSTER clause for replicated named collections management queries.", 0) \ /** Settings for testing hedged requests */ \ M(Milliseconds, sleep_in_send_tables_status_ms, 0, "Time to sleep in sending 
tables status response in TCPHandler", 0) \ M(Milliseconds, sleep_in_send_data_ms, 0, "Time to sleep in sending data in TCPHandler", 0) \ diff --git a/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp b/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp index 79a17fd1844..0e83e2039f6 100644 --- a/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -13,14 +14,16 @@ namespace DB BlockIO InterpreterAlterNamedCollectionQuery::execute() { auto current_context = getContext(); - const auto & query = query_ptr->as(); + + const auto updated_query = removeOnClusterClauseIfNeeded(query_ptr, getContext()); + const auto & query = updated_query->as(); current_context->checkAccess(AccessType::ALTER_NAMED_COLLECTION, query.collection_name); if (!query.cluster.empty()) { DDLQueryOnClusterParams params; - return executeDDLQueryOnCluster(query_ptr, current_context, params); + return executeDDLQueryOnCluster(updated_query, current_context, params); } NamedCollectionFactory::instance().updateFromSQL(query); diff --git a/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp b/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp index c71441daa8c..b4920b1729f 100644 --- a/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -13,14 +14,16 @@ namespace DB BlockIO InterpreterCreateNamedCollectionQuery::execute() { auto current_context = getContext(); - const auto & query = query_ptr->as(); + + const auto updated_query = removeOnClusterClauseIfNeeded(query_ptr, getContext()); + const auto & query = updated_query->as(); current_context->checkAccess(AccessType::CREATE_NAMED_COLLECTION, query.collection_name); if (!query.cluster.empty()) { DDLQueryOnClusterParams params; - return 
executeDDLQueryOnCluster(query_ptr, current_context, params); + return executeDDLQueryOnCluster(updated_query, current_context, params); } NamedCollectionFactory::instance().createFromSQL(query); diff --git a/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp b/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp index 2edaef1b2f2..6233d21b439 100644 --- a/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -13,14 +14,16 @@ namespace DB BlockIO InterpreterDropNamedCollectionQuery::execute() { auto current_context = getContext(); - const auto & query = query_ptr->as(); + + const auto updated_query = removeOnClusterClauseIfNeeded(query_ptr, getContext()); + const auto & query = updated_query->as(); current_context->checkAccess(AccessType::DROP_NAMED_COLLECTION, query.collection_name); if (!query.cluster.empty()) { DDLQueryOnClusterParams params; - return executeDDLQueryOnCluster(query_ptr, current_context, params); + return executeDDLQueryOnCluster(updated_query, current_context, params); } NamedCollectionFactory::instance().removeFromSQL(query); diff --git a/src/Interpreters/removeOnClusterClauseIfNeeded.cpp b/src/Interpreters/removeOnClusterClauseIfNeeded.cpp index 44167fe7242..dd20164925c 100644 --- a/src/Interpreters/removeOnClusterClauseIfNeeded.cpp +++ b/src/Interpreters/removeOnClusterClauseIfNeeded.cpp @@ -15,6 +15,10 @@ #include #include #include +#include +#include +#include +#include namespace DB @@ -38,6 +42,13 @@ static bool isAccessControlQuery(const ASTPtr & query) || query->as(); } +static bool isNamedCollectionQuery(const ASTPtr & query) +{ + return query->as() + || query->as() + || query->as(); +} + ASTPtr removeOnClusterClauseIfNeeded(const ASTPtr & query, ContextPtr context, const WithoutOnClusterASTRewriteParams & params) { auto * query_on_cluster = dynamic_cast(query.get()); @@ -50,7 +61,10 @@ ASTPtr 
removeOnClusterClauseIfNeeded(const ASTPtr & query, ContextPtr context, c && context->getUserDefinedSQLObjectsStorage().isReplicated()) || (isAccessControlQuery(query) && context->getSettings().ignore_on_cluster_for_replicated_access_entities_queries - && context->getAccessControl().containsStorage(ReplicatedAccessStorage::STORAGE_TYPE))) + && context->getAccessControl().containsStorage(ReplicatedAccessStorage::STORAGE_TYPE)) + || (isNamedCollectionQuery(query) + && context->getSettings().ignore_on_cluster_for_replicated_named_collections_queries + && NamedCollectionFactory::instance().usesReplicatedStorage())) { LOG_DEBUG(getLogger("removeOnClusterClauseIfNeeded"), "ON CLUSTER clause was ignored for query {}", query->getID()); return query_on_cluster->getRewrittenASTWithoutOnCluster(params); diff --git a/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml b/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml index 2d7946d1587..43d80ee6f69 100644 --- a/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml +++ b/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml @@ -9,4 +9,21 @@ value1 + + + + + true + + node_with_keeper + 9000 + + + node_with_keeper_2 + 9000 + + + true + + diff --git a/tests/integration/test_named_collections/configs/users.d/users.xml b/tests/integration/test_named_collections/configs/users.d/users.xml index 15da914f666..7d4f0543ff1 100644 --- a/tests/integration/test_named_collections/configs/users.d/users.xml +++ b/tests/integration/test_named_collections/configs/users.d/users.xml @@ -1,4 +1,9 @@ + + + 0 + + diff --git a/tests/integration/test_named_collections/test.py b/tests/integration/test_named_collections/test.py index dbc502236c0..5d38047e885 100644 --- a/tests/integration/test_named_collections/test.py +++ b/tests/integration/test_named_collections/test.py @@ -3,6 +3,8 
@@ import pytest import os import time from helpers.cluster import ClickHouseCluster +from contextlib import nullcontext as does_not_raise +from helpers.client import QueryRuntimeException SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) NAMED_COLLECTIONS_CONFIG = os.path.join( @@ -761,3 +763,29 @@ def test_keeper_storage(cluster): check_dropped(node1) check_dropped(node2) + + +@pytest.mark.parametrize( + "ignore, expected_raise", + [(True, does_not_raise()), (False, pytest.raises(QueryRuntimeException))], +) +def test_keeper_storage_remove_on_cluster(cluster, ignore, expected_raise): + node = cluster.instances["node_with_keeper"] + + replace_in_users_config( + node, + "ignore_on_cluster_for_replicated_named_collections_queries>.", + f"ignore_on_cluster_for_replicated_named_collections_queries>{int(ignore)}", + ) + node.query("SYSTEM RELOAD CONFIG") + + with expected_raise: + node.query( + f"CREATE NAMED COLLECTION test_nc ON CLUSTER `replicated_nc_nodes_cluster` AS key1=1, key2=2 OVERRIDABLE" + ) + node.query( + f"ALTER NAMED COLLECTION test_nc ON CLUSTER `replicated_nc_nodes_cluster` SET key2=3" + ) + node.query( + f"DROP NAMED COLLECTION test_nc ON CLUSTER `replicated_nc_nodes_cluster`" + ) From 5a12659f43f74aa501610404c4b2ee6b1b4a02c9 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 9 Jul 2024 20:18:03 +0200 Subject: [PATCH 36/70] Update run.sh --- docker/test/stateless/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 43d3c698d8a..637d277e6f8 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -249,7 +249,7 @@ function run_tests() try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" set +e - timeout -s TERM --preserve-status 120m clickhouse-test --testname --shard --zookeeper 
--check-zookeeper-session --hung-check --print-time \ + timeout -s KILL --preserve-status 120m clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ --no-drop-if-fail --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt From 0d54151cb81421b8eaa99df0c8abb224b776570b Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 9 Jul 2024 19:55:37 +0000 Subject: [PATCH 37/70] Make the pocketfft to point to the upstream/master branch --- contrib/pocketfft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pocketfft b/contrib/pocketfft index 9efd4da52cf..f4c1aa8aa9c 160000 --- a/contrib/pocketfft +++ b/contrib/pocketfft @@ -1 +1 @@ -Subproject commit 9efd4da52cf8d28d14531d14e43ad9d913807546 +Subproject commit f4c1aa8aa9ce79ad39e80f2c9c41b92ead90fda3 From 2b091983e8df97a5a103be8aa03ad2c0a836ff46 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 9 Jul 2024 19:59:49 +0000 Subject: [PATCH 38/70] Bump Azure to https://github.com/ClickHouse/azure-sdk-for-cpp/commit/ea3e19a7be08519134c643177d56c7484dfec884 --- contrib/azure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/azure b/contrib/azure index 92c94d7f37a..ea3e19a7be0 160000 --- a/contrib/azure +++ b/contrib/azure @@ -1 +1 @@ -Subproject commit 92c94d7f37a43cc8fc4d466884a95f610c0593bf +Subproject commit ea3e19a7be08519134c643177d56c7484dfec884 From ebc87d0c702e9bb26814718fec97e4c938735dec Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 9 Jul 2024 22:58:06 +0200 Subject: [PATCH 39/70] Update run.sh --- docker/test/stateless/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 637d277e6f8..1c03f5107b0 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -249,7 +249,7 @@ function 
run_tests() try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" set +e - timeout -s KILL --preserve-status 120m clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ + timeout -s TERM --preserve-status 120m -k 60m clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ --no-drop-if-fail --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt From bc02d8e66ecc82bee3c8d0402b01816c5005ece9 Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Wed, 10 Jul 2024 08:01:36 +0000 Subject: [PATCH 40/70] Fix settings changelog --- src/Core/SettingsChangesHistory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index b0725340f46..3ccc7321088 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -59,6 +59,7 @@ static std::initializer_list Date: Wed, 10 Jul 2024 11:56:43 +0200 Subject: [PATCH 41/70] Update run.sh --- docker/test/stateless/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 1c03f5107b0..8e66d2667f1 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -249,7 +249,7 @@ function run_tests() try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" set +e - timeout -s TERM --preserve-status 120m -k 60m clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ + timeout -k 60m -s TERM --preserve-status 120m clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ --no-drop-if-fail --test-runs "$NUM_TRIES" 
"${ADDITIONAL_OPTIONS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt From a32795d116903c66c18263f47e5d1e622d83a362 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 Jul 2024 10:07:02 +0000 Subject: [PATCH 42/70] Fix review comments --- src/Formats/JSONExtractTree.cpp | 174 ++++++++++++++++++++++++-------- src/Formats/JSONExtractTree.h | 6 ++ src/Functions/FunctionsJSON.cpp | 3 + 3 files changed, 139 insertions(+), 44 deletions(-) diff --git a/src/Formats/JSONExtractTree.cpp b/src/Formats/JSONExtractTree.cpp index 9efb1392583..242d2dc9f80 100644 --- a/src/Formats/JSONExtractTree.cpp +++ b/src/Formats/JSONExtractTree.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -35,9 +36,8 @@ #include #include #include -#include #include -#include +#include #include #include @@ -123,10 +123,7 @@ void jsonElementToString(const typename JSONParser::Element & element, WriteBuff template bool tryGetNumericValueFromJSONElement( - NumberType & value, - const typename JSONParser::Element & element, - bool convert_bool_to_integer, - String & error) + NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, String & error) { switch (element.type()) { @@ -226,7 +223,11 @@ public: explicit NumericNode(bool is_bool_type_ = false) : is_bool_type(is_bool_type_) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -270,7 +271,11 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings 
& format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -309,7 +314,11 @@ class StringNode : public JSONExtractTreeNode { public: bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -349,7 +358,11 @@ public: explicit LowCardinalityStringNode(bool is_nullable_) : is_nullable(is_nullable_) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -387,7 +400,11 @@ class FixedStringNode : public JSONExtractTreeNode public: explicit FixedStringNode(size_t fixed_length_) : fixed_length(fixed_length_) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -431,7 +448,11 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, 
const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -484,7 +505,11 @@ class UUIDNode : public JSONExtractTreeNode { public: bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -525,7 +550,11 @@ public: explicit LowCardinalityUUIDNode(bool is_nullable_) : is_nullable(is_nullable_) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && (is_nullable || format_settings.null_as_default)) { @@ -560,7 +589,11 @@ class DateNode : public JSONExtractTreeNode { public: bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -595,7 +628,11 @@ public: explicit DateTimeNode(const DataTypeDateTime & datetime_type) : TimezoneMixin(datetime_type) { 
} bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -656,7 +693,11 @@ public: explicit DecimalNode(const DataTypePtr & type) : scale(assert_cast &>(*type).getScale()) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { DecimalType value{}; @@ -688,7 +729,8 @@ public: } break; } - default: { + default: + { error = fmt::format("cannot read Decimal value from JSON element: {}", jsonElementToString(element, format_settings)); return false; } @@ -707,10 +749,16 @@ template class DateTime64Node : public JSONExtractTreeNode, public TimezoneMixin { public: - explicit DateTime64Node(const DataTypeDateTime64 & datetime64_type) : TimezoneMixin(datetime64_type), scale(datetime64_type.getScale()) { } + explicit DateTime64Node(const DataTypeDateTime64 & datetime64_type) : TimezoneMixin(datetime64_type), scale(datetime64_type.getScale()) + { + } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && 
format_settings.null_as_default) { @@ -790,7 +838,11 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull()) { @@ -857,7 +909,11 @@ class IPv4Node : public JSONExtractTreeNode { public: bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -895,7 +951,11 @@ class IPv6Node : public JSONExtractTreeNode { public: bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings &, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings &, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -936,7 +996,11 @@ public: explicit NullableNode(std::unique_ptr> nested_) : nested(std::move(nested_)) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & 
error) const override { if (element.isNull()) { @@ -945,7 +1009,7 @@ public: } auto & col_null = assert_cast(column); - if (!nested-> insertResultToColumn(col_null.getNestedColumn(), element, insert_settings, format_settings, error)) + if (!nested->insertResultToColumn(col_null.getNestedColumn(), element, insert_settings, format_settings, error)) return false; col_null.getNullMapColumn().insertValue(0); return true; @@ -965,7 +1029,11 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && (is_nullable || format_settings.null_as_default)) { @@ -975,7 +1043,7 @@ public: auto & col_lc = assert_cast(column); auto tmp_nested = col_lc.getDictionary().getNestedColumn()->cloneEmpty(); - if (!nested-> insertResultToColumn(*tmp_nested, element, insert_settings, format_settings, error)) + if (!nested->insertResultToColumn(*tmp_nested, element, insert_settings, format_settings, error)) return false; col_lc.insertFromFullColumn(*tmp_nested, 0); @@ -994,7 +1062,11 @@ public: explicit ArrayNode(std::unique_ptr> nested_) : nested(std::move(nested_)) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { if (element.isNull() && format_settings.null_as_default) { @@ -1017,7 +1089,7 @@ public: for (auto value : array) { - if (nested-> 
insertResultToColumn(data, value, insert_settings, format_settings, error)) + if (nested->insertResultToColumn(data, value, insert_settings, format_settings, error)) { were_valid_elements = true; } @@ -1058,7 +1130,11 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { auto & tuple = assert_cast(column); size_t old_size = column.size(); @@ -1087,7 +1163,7 @@ public: for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index) { - if (nested[index]-> insertResultToColumn(tuple.getColumn(index), *it++, insert_settings, format_settings, error)) + if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it++, insert_settings, format_settings, error)) { were_valid_elements = true; } @@ -1115,7 +1191,7 @@ public: auto it = object.begin(); for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index) { - if (nested[index]-> insertResultToColumn(tuple.getColumn(index), (*it++).second, insert_settings, format_settings, error)) + if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it++).second, insert_settings, format_settings, error)) { were_valid_elements = true; } @@ -1138,7 +1214,7 @@ public: auto index = name_to_index_map.find(key); if (index != name_to_index_map.end()) { - if (nested[index->second]-> insertResultToColumn(tuple.getColumn(index->second), value, insert_settings, format_settings, error)) + if (nested[index->second]->insertResultToColumn(tuple.getColumn(index->second), value, insert_settings, format_settings, error)) { were_valid_elements = true; } @@ -1173,7 +1249,11 @@ public: explicit MapNode(std::unique_ptr> value_) : 
value(std::move(value_)) { } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { if (!element.isObject()) { @@ -1198,7 +1278,7 @@ public: key_col.insertData(pair.first.data(), pair.first.size()); /// Insert value - if (!value-> insertResultToColumn(value_col, pair.second, insert_settings, format_settings, error)) + if (!value->insertResultToColumn(value_col, pair.second, insert_settings, format_settings, error)) { if (insert_settings.insert_default_on_invalid_elements_in_complex_types) { @@ -1232,13 +1312,17 @@ public: } bool insertResultToColumn( - IColumn & column, const typename JSONParser::Element & element, const JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { auto & column_variant = assert_cast(column); for (size_t i : order) { auto & variant = column_variant.getVariantByGlobalDiscriminator(i); - if (variant_nodes[i]-> insertResultToColumn(variant, element, insert_settings, format_settings, error)) + if (variant_nodes[i]->insertResultToColumn(variant, element, insert_settings, format_settings, error)) { column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(i)); column_variant.getOffsets().push_back(variant.size() - 1); @@ -1262,7 +1346,12 @@ template class DynamicNode : public JSONExtractTreeNode { public: - bool insertResultToColumn(IColumn & column, const typename JSONParser::Element & element, const 
JSONExtractInsertSettings & insert_settings, const FormatSettings & format_settings, String & error) const override + bool insertResultToColumn( + IColumn & column, + const typename JSONParser::Element & element, + const JSONExtractInsertSettings & insert_settings, + const FormatSettings & format_settings, + String & error) const override { auto & column_dynamic = assert_cast(column); /// First, check if element is NULL. @@ -1281,7 +1370,7 @@ public: auto node = buildJSONExtractTree(element_type, "Dynamic inference"); auto global_discriminator = variant_info.variant_name_to_discriminator[element_type->getName()]; auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discriminator); - if (!node-> insertResultToColumn(variant, element, insert_settings, format_settings, error)) + if (!node->insertResultToColumn(variant, element, insert_settings, format_settings, error)) return false; variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discriminator)); variant_column.getOffsets().push_back(variant.size() - 1); @@ -1290,14 +1379,14 @@ public: /// We couldn't add new variant. Try to insert element into current variants. auto variant_node = buildJSONExtractTree(variant_info.variant_type, "Dynamic inference"); - if (variant_node-> insertResultToColumn(variant_column, element, insert_settings, format_settings, error)) + if (variant_node->insertResultToColumn(variant_column, element, insert_settings, format_settings, error)) return true; /// We couldn't insert element into any existing variant, add String variant and read value as String. 
column_dynamic.addStringVariant(); auto string_global_discriminator = variant_info.variant_name_to_discriminator["String"]; auto & string_column = variant_column.getVariantByGlobalDiscriminator(string_global_discriminator); - if (!getStringNode()-> insertResultToColumn(string_column, element, insert_settings, format_settings, error)) + if (!getStringNode()->insertResultToColumn(string_column, element, insert_settings, format_settings, error)) return false; variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(string_global_discriminator)); variant_column.getOffsets().push_back(string_column.size() - 1); @@ -1348,12 +1437,9 @@ private: if (format_settings.json.try_infer_numbers_from_strings) { - bool is_negative = false; if (auto type = tryInferJSONNumberFromString(data, format_settings, &json_inference_info)) { json_inference_info.numbers_parsed_from_json_strings.insert(type.get()); - if (is_negative) - json_inference_info.negative_integers.insert(type.get()); return type; } } diff --git a/src/Formats/JSONExtractTree.h b/src/Formats/JSONExtractTree.h index 4735f568b1c..b5e82506548 100644 --- a/src/Formats/JSONExtractTree.h +++ b/src/Formats/JSONExtractTree.h @@ -9,7 +9,13 @@ namespace DB struct JSONExtractInsertSettings { + /// If false, JSON boolean values won't be inserted into columns with integer types + /// It's used in JSONExtractInt64/JSONExtractUInt64/... functions. bool convert_bool_to_integer = true; + /// If true, when complex type like Array/Map has both valid and invalid elements, + /// the default value will be inserted on invalid elements. + /// For example, if we have [1, "hello", 2] and type Array(UInt32), + /// we will insert [1, 0, 2] in the column. Used in all JSONExtract functions. 
bool insert_default_on_invalid_elements_in_complex_types = false; }; diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index ca233becb63..db1602b1939 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -354,7 +354,10 @@ public: explicit ExecutableFunctionJSON(const NullPresence & null_presence_, bool allow_simdjson_, const DataTypePtr & json_return_type_, const FormatSettings & format_settings_) : null_presence(null_presence_), allow_simdjson(allow_simdjson_), json_return_type(json_return_type_), format_settings(format_settings_) { + /// Don't escape forward slashes during converting JSON elements to raw string. format_settings.json.escape_forward_slashes = false; + /// Don't insert default values on null during traversing the JSON element. + /// We allow to insert null only to Nullable columns in JSONExtract functions. format_settings.null_as_default = false; } From 6ffac4034a4ca1b18a2ad5aa44f2c7cce696c246 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 10 Jul 2024 14:20:12 +0200 Subject: [PATCH 43/70] Enable checks in assert_cast under sanitizers --- src/Common/assert_cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/assert_cast.h b/src/Common/assert_cast.h index 0b73ba1cc12..f9d0bf0e595 100644 --- a/src/Common/assert_cast.h +++ b/src/Common/assert_cast.h @@ -25,7 +25,7 @@ namespace DB template inline To assert_cast(From && from) { -#ifndef NDEBUG +#ifdef ABORT_ON_LOGICAL_ERROR try { if constexpr (std::is_pointer_v) From f8b9fe621a9b249764538d08dbf361e3ba4a1d49 Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Wed, 10 Jul 2024 12:47:19 +0000 Subject: [PATCH 44/70] Fix test --- tests/integration/test_named_collections/test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/test_named_collections/test.py b/tests/integration/test_named_collections/test.py index 5d38047e885..32846c79d23 100644 --- 
a/tests/integration/test_named_collections/test.py +++ b/tests/integration/test_named_collections/test.py @@ -780,6 +780,9 @@ def test_keeper_storage_remove_on_cluster(cluster, ignore, expected_raise): node.query("SYSTEM RELOAD CONFIG") with expected_raise: + node.query( + "DROP NAMED COLLECTION IF EXISTS test_nc ON CLUSTER `replicated_nc_nodes_cluster`" + ) node.query( f"CREATE NAMED COLLECTION test_nc ON CLUSTER `replicated_nc_nodes_cluster` AS key1=1, key2=2 OVERRIDABLE" ) From 9c0610ec2bf454ba4740a2117fb5b0d03510607f Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 10 Jul 2024 21:27:15 +0200 Subject: [PATCH 45/70] add remaining window functions --- .../window-functions/first_value.md | 72 +++++++++++++++++++ .../sql-reference/window-functions/index.md | 4 +- .../window-functions/last_value.md | 72 +++++++++++++++++++ .../window-functions/leadInFrame.md | 2 +- .../window-functions/nth_value.md | 24 +++---- .../en/sql-reference/window-functions/rank.md | 2 +- .../window-functions/row_number.md | 2 +- 7 files changed, 161 insertions(+), 17 deletions(-) create mode 100644 docs/en/sql-reference/window-functions/first_value.md create mode 100644 docs/en/sql-reference/window-functions/last_value.md diff --git a/docs/en/sql-reference/window-functions/first_value.md b/docs/en/sql-reference/window-functions/first_value.md new file mode 100644 index 00000000000..575a6fc3f48 --- /dev/null +++ b/docs/en/sql-reference/window-functions/first_value.md @@ -0,0 +1,72 @@ +--- +slug: /en/sql-reference/window-functions/first_value +sidebar_label: first_value +sidebar_position: 3 +--- + +# first_value + +Returns the first non-NULL value evaluated within its ordered frame. 
+ +**Syntax** + +```sql +first_value (column_name) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Returned value** + +- The first non-NULL value evaluated within its ordered frame. + +**Example** + +In this example the `first_value` function is used to find the highest paid footballer from a fictional dataset of salaries of Premier League football players. + +Query: + +```sql +DROP TABLE IF EXISTS salaries; +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 196000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 100000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 180000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), + ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), + ('South Hampton Seagulls', 'James Henderson', 140000, 'M'); +``` + +```sql +SELECT player, salary, + first_value(player) OVER (ORDER BY salary DESC) AS highest_paid_player +FROM salaries; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─highest_paid_player─┐ +1. │ Gary Chen │ 196000 │ Gary Chen │ +2. │ Robert George │ 195000 │ Gary Chen │ +3. │ Charles Juarez │ 190000 │ Gary Chen │ +4. │ Scott Harrison │ 180000 │ Gary Chen │ +5. │ Douglas Benson │ 150000 │ Gary Chen │ +6. │ James Henderson │ 140000 │ Gary Chen │ +7. 
│ Michael Stanley │ 100000 │ Gary Chen │ + └─────────────────┴────────┴─────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index ee54a679ba1..d18dbcc189d 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -76,8 +76,8 @@ WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column] These functions can be used only as a window function. - [`row_number()`](./row_number.md) - Number the current row within its partition starting from 1. -- `first_value(x)` - Return the first non-NULL value evaluated within its ordered frame. -- `last_value(x)` - Return the last non-NULL value evaluated within its ordered frame. +- [`first_value(x)`](./first_value.md) - Return the first non-NULL value evaluated within its ordered frame. +- [`last_value(x)`](./last_value.md) - Return the last non-NULL value evaluated within its ordered frame. - [`nth_value(x, offset)`](./nth_value.md) - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. - [`rank()`](./rank.md) - Rank the current row within its partition with gaps. - [`dense_rank()`](./dense_rank.md) - Rank the current row within its partition without gaps. diff --git a/docs/en/sql-reference/window-functions/last_value.md b/docs/en/sql-reference/window-functions/last_value.md new file mode 100644 index 00000000000..098ee81ceb3 --- /dev/null +++ b/docs/en/sql-reference/window-functions/last_value.md @@ -0,0 +1,72 @@ +--- +slug: /en/sql-reference/window-functions/last_value +sidebar_label: last_value +sidebar_position: 4 +--- + +# last_value + +Return the last non-NULL value evaluated within its ordered frame.
+ +**Syntax** + +```sql +first_value (column_name) + OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) +FROM table_name +WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) +``` + +For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). + +**Returned value** + +- The last non-NULL value evaluated within its ordered frame. + +**Example** + +In this example the `last_value` function is used to find the lowest paid footballer from a fictional dataset of salaries of Premier League football players. + +Query: + +```sql +DROP TABLE IF EXISTS salaries; +CREATE TABLE salaries +( + `team` String, + `player` String, + `salary` UInt32, + `position` String +) +Engine = Memory; + +INSERT INTO salaries FORMAT Values + ('Port Elizabeth Barbarians', 'Gary Chen', 196000, 'F'), + ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 100000, 'D'), + ('New Coreystad Archdukes', 'Scott Harrison', 180000, 'D'), + ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), + ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), + ('South Hampton Seagulls', 'James Henderson', 140000, 'M'); +``` + +```sql +SELECT player, salary, + last_value(player) OVER (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS lowest_paid_player +FROM salaries; +``` + +Result: + +```response + ┌─player──────────┬─salary─┬─lowest_paid_player─┐ +1. │ Gary Chen │ 196000 │ Michael Stanley │ +2. │ Robert George │ 195000 │ Michael Stanley │ +3. │ Charles Juarez │ 190000 │ Michael Stanley │ +4. │ Scott Harrison │ 180000 │ Michael Stanley │ +5. │ Douglas Benson │ 150000 │ Michael Stanley │ +6. │ James Henderson │ 140000 │ Michael Stanley │ +7. 
│ Michael Stanley │ 100000 │ Michael Stanley │ + └─────────────────┴────────┴────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/leadInFrame.md b/docs/en/sql-reference/window-functions/leadInFrame.md index 0cb4eea52b2..33f69c0dcae 100644 --- a/docs/en/sql-reference/window-functions/leadInFrame.md +++ b/docs/en/sql-reference/window-functions/leadInFrame.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/leadInFrame sidebar_label: leadInFrame -sidebar_position: 4 +sidebar_position: 5 --- # leadInFrame diff --git a/docs/en/sql-reference/window-functions/nth_value.md b/docs/en/sql-reference/window-functions/nth_value.md index 26c90110aaa..5c430707009 100644 --- a/docs/en/sql-reference/window-functions/nth_value.md +++ b/docs/en/sql-reference/window-functions/nth_value.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/leadInFrame sidebar_label: leadInFrame -sidebar_position: 5 +sidebar_position: 6 --- # nth_value @@ -51,7 +51,7 @@ Engine = Memory; INSERT INTO salaries FORMAT Values ('Port Elizabeth Barbarians', 'Gary Chen', 195000, 'F'), ('New Coreystad Archdukes', 'Charles Juarez', 190000, 'F'), - ('Port Elizabeth Barbarians', 'Michael Stanley', 10000, 'D'), + ('Port Elizabeth Barbarians', 'Michael Stanley', 100000, 'D'), ('New Coreystad Archdukes', 'Scott Harrison', 180000, 'D'), ('Port Elizabeth Barbarians', 'Robert George', 195000, 'M'), ('South Hampton Seagulls', 'Douglas Benson', 150000, 'M'), @@ -59,19 +59,19 @@ INSERT INTO salaries FORMAT Values ``` ```sql -SELECT salary, nth_value(salary,3) OVER(ORDER BY salary DESC) FROM salaries GROUP BY salary; +SELECT player, salary, nth_value(player,3) OVER(ORDER BY salary DESC) AS third_highest_salary FROM salaries; ``` Result: ```response - ┌─player──────────┬─salary─┬─rank─┐ -1. │ Gary Chen │ 195000 │ 1 │ -2. │ Robert George │ 195000 │ 1 │ -3. │ Charles Juarez │ 190000 │ 3 │ -4. │ Douglas Benson │ 150000 │ 4 │ -5. 
│ Michael Stanley │ 150000 │ 4 │ -6. │ Scott Harrison │ 150000 │ 4 │ -7. │ James Henderson │ 140000 │ 7 │ - └─────────────────┴────────┴──────┘ + ┌─player──────────┬─salary─┬─third_highest_salary─┐ +1. │ Gary Chen │ 195000 │ │ +2. │ Robert George │ 195000 │ │ +3. │ Charles Juarez │ 190000 │ Charles Juarez │ +4. │ Scott Harrison │ 180000 │ Charles Juarez │ +5. │ Douglas Benson │ 150000 │ Charles Juarez │ +6. │ James Henderson │ 140000 │ Charles Juarez │ +7. │ Michael Stanley │ 100000 │ Charles Juarez │ + └─────────────────┴────────┴──────────────────────┘ ``` \ No newline at end of file diff --git a/docs/en/sql-reference/window-functions/rank.md b/docs/en/sql-reference/window-functions/rank.md index 9ac99dde6df..d7ed8d79c35 100644 --- a/docs/en/sql-reference/window-functions/rank.md +++ b/docs/en/sql-reference/window-functions/rank.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/rank sidebar_label: rank -sidebar_position: 6 +sidebar_position: 7 --- # rank diff --git a/docs/en/sql-reference/window-functions/row_number.md b/docs/en/sql-reference/window-functions/row_number.md index e7165d60169..485ca355f12 100644 --- a/docs/en/sql-reference/window-functions/row_number.md +++ b/docs/en/sql-reference/window-functions/row_number.md @@ -1,7 +1,7 @@ --- slug: /en/sql-reference/window-functions/row_number sidebar_label: row_number -sidebar_position: 7 +sidebar_position: 8 --- # row_number From 4d60ff6a91b9d17744a8522e3da0b850215a76d2 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 10 Jul 2024 21:51:14 +0200 Subject: [PATCH 46/70] small updates --- docs/en/sql-reference/window-functions/dense_rank.md | 4 ++-- docs/en/sql-reference/window-functions/lagInFrame.md | 4 ++-- docs/en/sql-reference/window-functions/last_value.md | 10 +++++----- docs/en/sql-reference/window-functions/leadInFrame.md | 4 ++-- docs/en/sql-reference/window-functions/nth_value.md | 10 ++++------ docs/en/sql-reference/window-functions/rank.md | 4 ++-- 
docs/en/sql-reference/window-functions/row_number.md | 4 ++-- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/docs/en/sql-reference/window-functions/dense_rank.md b/docs/en/sql-reference/window-functions/dense_rank.md index 17ab894707e..d6445b68c55 100644 --- a/docs/en/sql-reference/window-functions/dense_rank.md +++ b/docs/en/sql-reference/window-functions/dense_rank.md @@ -1,12 +1,12 @@ --- slug: /en/sql-reference/window-functions/dense_rank sidebar_label: dense_rank -sidebar_position: 2 +sidebar_position: 7 --- # dense_rank -This window function ranks the current row within its partition without gaps. In other words, if the value of any new row encountered is equal to the value of one of the previous rows then it will receive the next successive rank without any gaps in ranking. +Ranks the current row within its partition without gaps. In other words, if the value of any new row encountered is equal to the value of one of the previous rows then it will receive the next successive rank without any gaps in ranking. The [rank](./rank.md) function provides the same behaviour, but with gaps in ranking. diff --git a/docs/en/sql-reference/window-functions/lagInFrame.md b/docs/en/sql-reference/window-functions/lagInFrame.md index b67cf252283..049e095c10f 100644 --- a/docs/en/sql-reference/window-functions/lagInFrame.md +++ b/docs/en/sql-reference/window-functions/lagInFrame.md @@ -1,12 +1,12 @@ --- slug: /en/sql-reference/window-functions/lagInFrame sidebar_label: lagInFrame -sidebar_position: 3 +sidebar_position: 8 --- # lagInFrame -Return a value evaluated at the row that is at a specified physical offset before the current row within the ordered frame. The offset parameter, if not specified, defaults to 1, meaning it will fetch the value from the next row. If the calculated row exceeds the boundaries of the window frame, the specified default value is returned. 
+Returns a value evaluated at the row that is at a specified physical offset row before the current row within the ordered frame. **Syntax** diff --git a/docs/en/sql-reference/window-functions/last_value.md b/docs/en/sql-reference/window-functions/last_value.md index 098ee81ceb3..99b7ca4f75a 100644 --- a/docs/en/sql-reference/window-functions/last_value.md +++ b/docs/en/sql-reference/window-functions/last_value.md @@ -1,17 +1,17 @@ --- -slug: /en/sql-reference/window-functions/lagInFrame -sidebar_label: lagInFrame +slug: /en/sql-reference/window-functions/last_value +sidebar_label: last_value sidebar_position: 4 --- -# first_value +# last_value -Return the last non-NULL value evaluated within its ordered frame. +Returns the last non-NULL value evaluated within its ordered frame. **Syntax** ```sql -first_value (column_name) +last_value (column_name) OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) FROM table_name diff --git a/docs/en/sql-reference/window-functions/leadInFrame.md b/docs/en/sql-reference/window-functions/leadInFrame.md index 33f69c0dcae..fc1b92cc266 100644 --- a/docs/en/sql-reference/window-functions/leadInFrame.md +++ b/docs/en/sql-reference/window-functions/leadInFrame.md @@ -1,12 +1,12 @@ --- slug: /en/sql-reference/window-functions/leadInFrame sidebar_label: leadInFrame -sidebar_position: 5 +sidebar_position: 9 --- # leadInFrame -Return a value evaluated at the row that is offset rows after the current row within the ordered frame. +Returns a value evaluated at the row that is offset rows after the current row within the ordered frame. 
**Syntax** diff --git a/docs/en/sql-reference/window-functions/nth_value.md b/docs/en/sql-reference/window-functions/nth_value.md index 5c430707009..aa5baf651a8 100644 --- a/docs/en/sql-reference/window-functions/nth_value.md +++ b/docs/en/sql-reference/window-functions/nth_value.md @@ -1,14 +1,12 @@ --- -slug: /en/sql-reference/window-functions/leadInFrame -sidebar_label: leadInFrame -sidebar_position: 6 +slug: /en/sql-reference/window-functions/nth_value +sidebar_label: nth_value +sidebar_position: 5 --- # nth_value -Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. - -The [dense_rank](./dense_rank.md) function provides the same behaviour but without gaps in ranking. +Returns the first non-NULL value evaluated against the nth row (offset) in its ordered frame. **Syntax** diff --git a/docs/en/sql-reference/window-functions/rank.md b/docs/en/sql-reference/window-functions/rank.md index d7ed8d79c35..dff5e154151 100644 --- a/docs/en/sql-reference/window-functions/rank.md +++ b/docs/en/sql-reference/window-functions/rank.md @@ -1,12 +1,12 @@ --- slug: /en/sql-reference/window-functions/rank sidebar_label: rank -sidebar_position: 7 +sidebar_position: 6 --- # rank -This window function ranks the current row within its partition with gaps. In other words, if the value of any row it encounters is equal to the value of a previous row then it will receive the same rank as that previous row. +Ranks the current row within its partition with gaps. In other words, if the value of any row it encounters is equal to the value of a previous row then it will receive the same rank as that previous row. The rank of the next row is then equal to the rank of the previous row plus a gap equal to the number of times the previous rank was given. The [dense_rank](./dense_rank.md) function provides the same behaviour but without gaps in ranking. 
diff --git a/docs/en/sql-reference/window-functions/row_number.md b/docs/en/sql-reference/window-functions/row_number.md index 485ca355f12..f1c331f89a3 100644 --- a/docs/en/sql-reference/window-functions/row_number.md +++ b/docs/en/sql-reference/window-functions/row_number.md @@ -1,12 +1,12 @@ --- slug: /en/sql-reference/window-functions/row_number sidebar_label: row_number -sidebar_position: 8 +sidebar_position: 2 --- # row_number -Numbers the current row within its partition starting from 1 +Numbers the current row within its partition starting from 1. **Syntax** From 41633cabb2e055a42db9e8899947358111470cf3 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 10 Jul 2024 22:16:03 +0200 Subject: [PATCH 47/70] Update first_value, last_value with possibility to use RESPECT NULLS --- docs/en/sql-reference/window-functions/first_value.md | 11 +++++++++-- docs/en/sql-reference/window-functions/index.md | 4 ++-- docs/en/sql-reference/window-functions/last_value.md | 11 +++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/window-functions/first_value.md b/docs/en/sql-reference/window-functions/first_value.md index 575a6fc3f48..17ca1cacda8 100644 --- a/docs/en/sql-reference/window-functions/first_value.md +++ b/docs/en/sql-reference/window-functions/first_value.md @@ -6,18 +6,25 @@ sidebar_position: 3 # first_value -Returns the first non-NULL value evaluated within its ordered frame. +Returns the first value evaluated within its ordered frame. By default, NULL arguments are skipped, however the `RESPECT NULLS` modifier can be used to override this behaviour. **Syntax** ```sql -first_value (column_name) +first_value (column_name) [RESPECT NULLS] OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) FROM table_name WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) ``` +Alias: `any`. 
+ +:::note +Using the optional modifier `RESPECT NULLS` after `first_value(column_name)` will ensure that `NULL` arguments are not skipped. +See [NULL processing](../aggregate-functions/index.md/#null-processing) for more information. +::: + For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). **Returned value** diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 712b99992ea..0c3e2ea1cb6 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -77,8 +77,8 @@ WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column] These functions can be used only as a window function. - [`row_number()`](./row_number.md) - Number the current row within its partition starting from 1. -- [`first_value(x)`](./first_value.md) - Return the first non-NULL value evaluated within its ordered frame. -- [`last_value(x)`](./last_value.md) - Return the last non-NULL value evaluated within its ordered frame. +- [`first_value(x)`](./first_value.md) - Return the first value evaluated within its ordered frame. +- [`last_value(x)`](./last_value.md) - Return the last value evaluated within its ordered frame. - [`nth_value(x, offset)`](./nth_value.md) - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame. - [`rank()`](./rank.md) - Rank the current row within its partition with gaps. - [`dense_rank()`](./dense_rank.md) - Rank the current row within its partition without gaps. diff --git a/docs/en/sql-reference/window-functions/last_value.md b/docs/en/sql-reference/window-functions/last_value.md index 99b7ca4f75a..9d1ce81cc57 100644 --- a/docs/en/sql-reference/window-functions/last_value.md +++ b/docs/en/sql-reference/window-functions/last_value.md @@ -6,18 +6,25 @@ sidebar_position: 4 # last_value -Returns the last non-NULL value evaluated within its ordered frame. 
+Returns the last value evaluated within its ordered frame. By default, NULL arguments are skipped, however the `RESPECT NULLS` modifier can be used to override this behaviour. **Syntax** ```sql -last_value (column_name) +last_value (column_name) [RESPECT NULLS] OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) FROM table_name WINDOW window_name as ([[PARTITION BY grouping_column] [ORDER BY sorting_column]) ``` +Alias: `anyLast`. + +:::note +Using the optional modifier `RESPECT NULLS` after `last_value(column_name)` will ensure that `NULL` arguments are not skipped. +See [NULL processing](../aggregate-functions/index.md/#null-processing) for more information. +::: + For more detail on window function syntax see: [Window Functions - Syntax](./index.md/#syntax). **Returned value** From 22706b89b9927045e463286c53d82c6369f68bf2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jul 2024 00:10:59 +0200 Subject: [PATCH 48/70] Try to fix links in docs --- docs/en/sql-reference/data-types/dynamic.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index e50f7e6ddaa..8b3c7479f4f 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -529,10 +529,10 @@ SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Var ### Binary output format -In [RowBinary](../../interfaces/formats.md#rowbinary-rowbinary) format values of `Dynamic` type are serialized in the following format: +In [RowBinary](/docs/en/interfaces/formats.md#rowbinary-rowbinary) format values of `Dynamic` type are serialized in the following format: ```text ``` -See the [data types binary encoding specification](../../sql-reference/data-types/data-types-binary-encoding.md) +See the [data types
binary encoding specification](/docs/en/sql-reference/data-types/data-types-binary-encoding.md) From 3c52651b5580034e5d42433320c0d3de70a15b4e Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Wed, 10 Jul 2024 11:58:45 +0000 Subject: [PATCH 49/70] s3_off_fix: initial (proper ifdef for registerStorageAzureQueue) --- src/Storages/registerStorages.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 9f849052071..adc1074b1fe 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -35,7 +35,6 @@ void registerStorageFuzzJSON(StorageFactory & factory); void registerStorageS3(StorageFactory & factory); void registerStorageHudi(StorageFactory & factory); void registerStorageS3Queue(StorageFactory & factory); -void registerStorageAzureQueue(StorageFactory & factory); #if USE_PARQUET void registerStorageDeltaLake(StorageFactory & factory); @@ -45,6 +44,10 @@ void registerStorageIceberg(StorageFactory & factory); #endif #endif +#if USE_AZURE_BLOB_STORAGE +void registerStorageAzureQueue(StorageFactory & factory); +#endif + #if USE_HDFS #if USE_HIVE void registerStorageHive(StorageFactory & factory); From 35850da12e0dc775f117ed060f839671306cc26b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:03:03 +0200 Subject: [PATCH 50/70] Update dynamic.md --- docs/en/sql-reference/data-types/dynamic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index 8b3c7479f4f..b5781a7dd62 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -1,6 +1,6 @@ --- slug: /en/sql-reference/data-types/dynamic -sidebar_position: 56 +sidebar_position: 62 sidebar_label: Dynamic --- From 595bce4945cef4ef1822e610e5352a13e654e45f Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 11 Jul 2024 12:06:04 +0200 Subject: [PATCH 51/70] Update docs/en/sql-reference/window-functions/first_value.md --- docs/en/sql-reference/window-functions/first_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/first_value.md b/docs/en/sql-reference/window-functions/first_value.md index 17ca1cacda8..4f8a9d393b1 100644 --- a/docs/en/sql-reference/window-functions/first_value.md +++ b/docs/en/sql-reference/window-functions/first_value.md @@ -11,7 +11,7 @@ Returns the first value evaluated within its ordered frame. By default, NULL arg **Syntax** ```sql -first_value (column_name) [RESPECT NULLS] +first_value (column_name) [[RESPECT NULLS] | [IGNORE NULLS]] OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) FROM table_name From 366ed8701e0e60bc6ca54258663987d3342d7763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 11 Jul 2024 12:06:09 +0200 Subject: [PATCH 52/70] Update docs/en/sql-reference/window-functions/first_value.md --- docs/en/sql-reference/window-functions/first_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/first_value.md b/docs/en/sql-reference/window-functions/first_value.md index 4f8a9d393b1..30c3b1f99dc 100644 --- a/docs/en/sql-reference/window-functions/first_value.md +++ b/docs/en/sql-reference/window-functions/first_value.md @@ -29,7 +29,7 @@ For more detail on window function syntax see: [Window Functions - Syntax](./ind **Returned value** -- The first non-NULL value evaluated within its ordered frame. +- The first value evaluated within its ordered frame. 
**Example** From 3d96bf298ceaf030c3e863ea2fabd0a6ebe90e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 11 Jul 2024 12:06:13 +0200 Subject: [PATCH 53/70] Update docs/en/sql-reference/window-functions/last_value.md --- docs/en/sql-reference/window-functions/last_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/last_value.md b/docs/en/sql-reference/window-functions/last_value.md index 9d1ce81cc57..34170226cdd 100644 --- a/docs/en/sql-reference/window-functions/last_value.md +++ b/docs/en/sql-reference/window-functions/last_value.md @@ -29,7 +29,7 @@ For more detail on window function syntax see: [Window Functions - Syntax](./ind **Returned value** -- The last non-NULL value evaluated within its ordered frame. +- The last value evaluated within its ordered frame. **Example** From 330082c3d4d1ff075e66f2eaf72c1515ad64ffdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 11 Jul 2024 12:06:33 +0200 Subject: [PATCH 54/70] Update docs/en/sql-reference/window-functions/last_value.md --- docs/en/sql-reference/window-functions/last_value.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/last_value.md b/docs/en/sql-reference/window-functions/last_value.md index 34170226cdd..dd7f5fa078a 100644 --- a/docs/en/sql-reference/window-functions/last_value.md +++ b/docs/en/sql-reference/window-functions/last_value.md @@ -11,7 +11,7 @@ Returns the last value evaluated within its ordered frame. 
By default, NULL argu **Syntax** ```sql -last_value (column_name) [RESPECT NULLS] +last_value (column_name) [[RESPECT NULLS] | [IGNORE NULLS]] OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column] [ROWS or RANGE expression_to_bound_rows_withing_the_group]] | [window_name]) FROM table_name From 050240d89071f750516f2a38fea1909d58095aaa Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 11 Jul 2024 12:08:16 +0200 Subject: [PATCH 55/70] Review fix --- src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index d37bffc42c4..c896a760597 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -635,7 +635,7 @@ struct DeltaLakeMetadataImpl } const auto value = tuple[1].safeGet(); auto field = getFieldValue(value, name_and_type->type); - current_partition_columns.emplace_back(*name_and_type, field); + current_partition_columns.emplace_back(std::move(name_and_type.value()), std::move(field)); LOG_TEST(log, "Partition {} value is {} (for {})", partition_name, value, filename); } From 80ceb63f5f194c4c99aa7502f64f7770933ae18f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Jul 2024 10:45:36 +0000 Subject: [PATCH 56/70] Fixing build. 
--- src/Common/Exception.cpp | 2 +- src/Common/Exception.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 07bda6a75be..09ba664baef 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -38,7 +38,7 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } -[[noreturn]] void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr) +void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace) { auto & logger = Poco::Logger::root(); LOG_FATAL(&logger, "Logical error: '{}'.", description); diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 87ef7101cdc..68cc305e67e 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -25,8 +25,6 @@ namespace DB class AtomicLogger; -[[noreturn]] void abortOnFailedAssertion(const String & description); - /// This flag can be set for testing purposes - to check that no exceptions are thrown. 
extern bool terminate_on_any_exception; @@ -167,6 +165,7 @@ protected: mutable std::vector capture_thread_frame_pointers; }; +[[noreturn]] void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr); std::string getExceptionStackTraceString(const std::exception & e); std::string getExceptionStackTraceString(std::exception_ptr e); From 1bc02fb71d6a27a2bc83484dec667edb48b0ab84 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Jul 2024 12:39:53 +0000 Subject: [PATCH 57/70] Ignore subquery for IN in DDLLoadingDependencyVisitor --- src/Databases/DDLLoadingDependencyVisitor.cpp | 8 +++++++ .../02841_not_ready_set_constraints.reference | 1 + .../02841_not_ready_set_constraints.sql | 24 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/src/Databases/DDLLoadingDependencyVisitor.cpp b/src/Databases/DDLLoadingDependencyVisitor.cpp index 40234abb20f..67bce915168 100644 --- a/src/Databases/DDLLoadingDependencyVisitor.cpp +++ b/src/Databases/DDLLoadingDependencyVisitor.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -211,6 +212,13 @@ void DDLLoadingDependencyVisitor::extractTableNameFromArgument(const ASTFunction qualified_name.database = table_identifier->getDatabaseName(); qualified_name.table = table_identifier->shortName(); } + else if (arg->as()) + { + /// Allow IN subquery. + /// Do not add tables from the subquery into dependencies, + /// because CREATE will succeed anyway. 
+ return; + } else { assert(false); diff --git a/tests/queries/0_stateless/02841_not_ready_set_constraints.reference b/tests/queries/0_stateless/02841_not_ready_set_constraints.reference index d81cc0710eb..daaac9e3030 100644 --- a/tests/queries/0_stateless/02841_not_ready_set_constraints.reference +++ b/tests/queries/0_stateless/02841_not_ready_set_constraints.reference @@ -1 +1,2 @@ 42 +42 diff --git a/tests/queries/0_stateless/02841_not_ready_set_constraints.sql b/tests/queries/0_stateless/02841_not_ready_set_constraints.sql index ecdf4d50635..274940f50a3 100644 --- a/tests/queries/0_stateless/02841_not_ready_set_constraints.sql +++ b/tests/queries/0_stateless/02841_not_ready_set_constraints.sql @@ -17,3 +17,27 @@ ENGINE = MergeTree ORDER BY conversation; INSERT INTO t2(conversation) VALUES (42); select * from t2; + +drop table t1; + +INSERT INTO t2(conversation) VALUES (42); -- { serverError UNKNOWN_TABLE } + +drop table t2; + +CREATE TABLE t2 ( + `conversation` UInt64, + CONSTRAINT constraint_conversation CHECK conversation IN (SELECT id FROM t1) +) +ENGINE = MergeTree ORDER BY conversation; + +INSERT INTO t2(conversation) VALUES (42); -- { serverError UNKNOWN_TABLE } + +CREATE TABLE t1 ( + `id` UInt64 +) +ENGINE = MergeTree ORDER BY id; + +INSERT INTO t1(id) VALUES (42); + +INSERT INTO t2(conversation) VALUES (42); +select * from t2; From 3e9f6265195ee6952d4963409de3bd4a1f344730 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:59:34 +0200 Subject: [PATCH 58/70] Update test.py --- tests/integration/test_storage_rabbitmq/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 3240039ee81..f885a3507ac 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -78,13 +78,13 @@ def 
wait_rabbitmq_to_start(rabbitmq_docker_id, cookie, timeout=180): def kill_rabbitmq(rabbitmq_id): p = subprocess.Popen(("docker", "stop", rabbitmq_id), stdout=subprocess.PIPE) - p.communicate() + p.wait(timeout=60) return p.returncode == 0 def revive_rabbitmq(rabbitmq_id, cookie): p = subprocess.Popen(("docker", "start", rabbitmq_id), stdout=subprocess.PIPE) - p.communicate() + p.wait(timeout=60) wait_rabbitmq_to_start(rabbitmq_id, cookie) From 262c1f9e77add2349c76777d9ddabfe06895c6d3 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:03:13 +0200 Subject: [PATCH 59/70] Update dynamic.md --- docs/en/sql-reference/data-types/dynamic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index b5781a7dd62..d0116e7158c 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -529,7 +529,7 @@ SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Var ### Binary output format -In [RowBinary](/docs/en/interfaces/formats.md#rowbinary-rowbinary) format values of `Dynamic` type are serialized in the following format: +In [RowBinary](/docs/en/interfaces/formats.md#rowbinary) format values of `Dynamic` type are serialized in the following format: ```text From 6868708a58f39fac83382695864459eb5fcffe5b Mon Sep 17 00:00:00 2001 From: Max Kainov Date: Thu, 11 Jul 2024 11:37:26 +0000 Subject: [PATCH 60/70] CI Buddy bot to notify about CI events --- pyproject.toml | 3 +- tests/ci/.mypy.ini | 1 + tests/ci/ci.py | 11 +++++- tests/ci/ci_buddy.py | 88 ++++++++++++++++++++++++++++++++++++++++++++ tests/ci/ci_utils.py | 41 +++++++++++++++++++++ 5 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 tests/ci/ci_buddy.py diff --git a/pyproject.toml b/pyproject.toml index 279d077a695..90f089afa41 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -35,10 +35,9 @@ disable = ''' broad-except, bare-except, no-else-return, - global-statement + global-statement, ''' [tool.pylint.SIMILARITIES] # due to SQL min-similarity-lines=1000 - diff --git a/tests/ci/.mypy.ini b/tests/ci/.mypy.ini index 9bc44025826..f12d27979ce 100644 --- a/tests/ci/.mypy.ini +++ b/tests/ci/.mypy.ini @@ -15,3 +15,4 @@ warn_return_any = True no_implicit_reexport = True strict_equality = True extra_checks = True +ignore_missing_imports = True \ No newline at end of file diff --git a/tests/ci/ci.py b/tests/ci/ci.py index af2f4c0a1fc..b4a3c7ec849 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -15,7 +15,7 @@ import upload_result_helper from build_check import get_release_or_pr from ci_config import CI from ci_metadata import CiMetadata -from ci_utils import GHActions, normalize_string +from ci_utils import GHActions, normalize_string, Shell from clickhouse_helper import ( CiLogsCredentials, ClickHouseHelper, @@ -53,6 +53,7 @@ from stopwatch import Stopwatch from tee_popen import TeePopen from ci_cache import CiCache from ci_settings import CiSettings +from ci_buddy import CIBuddy from version_helper import get_version_from_repo # pylint: disable=too-many-lines @@ -262,6 +263,8 @@ def check_missing_images_on_dockerhub( def _pre_action(s3, indata, pr_info): + print("Clear dmesg") + Shell.run("sudo dmesg --clear ||:") CommitStatusData.cleanup() JobReport.cleanup() BuildResult.cleanup() @@ -1118,6 +1121,12 @@ def main() -> int: ### POST action: start elif args.post: + if Shell.check( + "sudo dmesg -T | grep -q -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE'" + ): + print("WARNING: OOM while job execution") + CIBuddy().post_error("Out Of Memory") + job_report = JobReport.load() if JobReport.exist() else None if job_report: ch_helper = ClickHouseHelper() diff --git a/tests/ci/ci_buddy.py b/tests/ci/ci_buddy.py new file mode 100644 index 00000000000..d03f5d819ec --- 
/dev/null +++ b/tests/ci/ci_buddy.py @@ -0,0 +1,88 @@ +import json +import os + +import boto3 +import requests +from botocore.exceptions import ClientError + +from pr_info import PRInfo +from ci_utils import Shell + + +class CIBuddy: + _HEADERS = {"Content-Type": "application/json"} + + def __init__(self, dry_run=False): + self.repo = os.getenv("GITHUB_REPOSITORY", "") + self.dry_run = dry_run + res = self._get_webhooks() + self.test_channel = "" + self.dev_ci_channel = "" + if res: + self.test_channel = json.loads(res)["test_channel"] + self.dev_ci_channel = json.loads(res)["ci_channel"] + self.job_name = os.getenv("CHECK_NAME", "unknown") + pr_info = PRInfo() + self.pr_number = pr_info.number + self.head_ref = pr_info.head_ref + self.commit_url = pr_info.commit_html_url + + @staticmethod + def _get_webhooks(): + name = "ci_buddy_web_hooks" + + session = boto3.Session(region_name="us-east-1") # Replace with your region + ssm_client = session.client("ssm") + json_string = None + try: + response = ssm_client.get_parameter( + Name=name, + WithDecryption=True, # Set to True if the parameter is a SecureString + ) + json_string = response["Parameter"]["Value"] + except ClientError as e: + print(f"An error occurred: {e}") + + return json_string + + def post(self, message, dry_run=None): + if dry_run is None: + dry_run = self.dry_run + print(f"Posting slack message, dry_run [{dry_run}]") + if dry_run: + url = self.test_channel + else: + url = self.dev_ci_channel + data = {"text": message} + try: + requests.post(url, headers=self._HEADERS, data=json.dumps(data), timeout=10) + except Exception as e: + print(f"ERROR: Failed to post message, ex {e}") + + def post_error(self, error_description, job_name="", with_instance_info=True): + instance_id, instance_type = "unknown", "unknown" + if with_instance_info: + instance_id = Shell.run("ec2metadata --instance-id") or instance_id + instance_type = Shell.run("ec2metadata --instance-type") or instance_type + if not job_name: + 
job_name = os.getenv("CHECK_NAME", "unknown") + line_err = f":red_circle: {error_description} :red_circle:\n\n" + line_ghr = f" *Runner:* `{instance_type}`, `{instance_id}`\n" + line_job = f" *Job:* `{job_name}`\n" + line_pr_ = f" *PR:* \n" + line_br_ = f" *Branch:* `{self.head_ref}`, <{self.commit_url}|commit>\n" + message = line_err + message += line_job + if with_instance_info: + message += line_ghr + if self.pr_number > 0: + message += line_pr_ + else: + message += line_br_ + self.post(message) + + +if __name__ == "__main__": + # test + buddy = CIBuddy(dry_run=True) + buddy.post_error("Out of memory") diff --git a/tests/ci/ci_utils.py b/tests/ci/ci_utils.py index e7034d0b104..629f37289a9 100644 --- a/tests/ci/ci_utils.py +++ b/tests/ci/ci_utils.py @@ -1,4 +1,5 @@ import os +import subprocess from contextlib import contextmanager from pathlib import Path from typing import Any, Iterator, List, Union @@ -42,3 +43,43 @@ class GHActions: for line in lines: print(line) print("::endgroup::") + + +class Shell: + @classmethod + def run_strict(cls, command): + subprocess.run( + command + " 2>&1", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + + @classmethod + def run(cls, command): + res = "" + result = subprocess.run( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if result.returncode == 0: + res = result.stdout + return res.strip() + + @classmethod + def check(cls, command): + result = subprocess.run( + command + " 2>&1", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + return result.returncode == 0 From 808d875a760d81792691f4cd7c465ec2823aefa9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jul 2024 18:57:54 +0200 Subject: [PATCH 61/70] Remove links at all --- docs/en/sql-reference/data-types/dynamic.md | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index d0116e7158c..8be81471377 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -529,10 +529,8 @@ SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Var ### Binary output format -In [RowBinary](/docs/en/interfaces/formats.md#rowbinary) format values of `Dynamic` type are serialized in the following format: +In RowBinary format values of `Dynamic` type are serialized in the following format: ```text ``` - -See the [data types binary encoding specification](/docs/en/sql-reference/data-types/data-types-binary-encoding.md) From fe451ec25a3baa269cb47722e38dfd90f01b3734 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Jul 2024 17:34:33 +0000 Subject: [PATCH 62/70] Fixing build. --- src/Common/Exception.cpp | 12 +++++++++--- src/Common/Exception.h | 3 ++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 09ba664baef..111280074dd 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -38,15 +38,21 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } -void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace) +void abortOnFailedAssertion(const String & description, void * const * trace, size_t trace_offset, size_t trace_size) { auto & logger = Poco::Logger::root(); LOG_FATAL(&logger, "Logical error: '{}'.", description); if (trace) - LOG_FATAL(&logger, "Stack trace (when copying this message, always include the lines below):\n\n{}", StackTrace::toString(trace->data(), 0, trace->size())); + LOG_FATAL(&logger, "Stack trace (when copying this message, always include the lines below):\n\n{}", StackTrace::toString(trace, trace_offset, trace_size)); abort(); } +void abortOnFailedAssertion(const String & description) +{ + StackTrace st; + 
abortOnFailedAssertion(description, st.getFramePointers().data(), st.getOffset(), st.getSize()); +} + bool terminate_on_any_exception = false; static int terminate_status_code = 128 + SIGABRT; thread_local bool update_error_statistics = true; @@ -61,7 +67,7 @@ void handle_error_code(const std::string & msg, int code, bool remote, const Exc #ifdef ABORT_ON_LOGICAL_ERROR if (code == ErrorCodes::LOGICAL_ERROR) { - abortOnFailedAssertion(msg, &trace); + abortOnFailedAssertion(msg, trace.data(), 0, trace.size()); } #endif diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 68cc305e67e..a4774a89f6a 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -165,7 +165,8 @@ protected: mutable std::vector capture_thread_frame_pointers; }; -[[noreturn]] void abortOnFailedAssertion(const String & description, const Exception::FramePointers * trace = nullptr); +[[noreturn]] void abortOnFailedAssertion(const String & description, void * const * trace, size_t trace_offset, size_t trace_size); +[[noreturn]] void abortOnFailedAssertion(const String & description); std::string getExceptionStackTraceString(const std::exception & e); std::string getExceptionStackTraceString(std::exception_ptr e); From 8ef3fbf32333dea0be9ee3ebbd9a3c9529cb9fb6 Mon Sep 17 00:00:00 2001 From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Date: Fri, 12 Jul 2024 07:50:35 +0700 Subject: [PATCH 63/70] docs(clickhouse-local): intended section link --- docs/en/operations/utilities/clickhouse-local.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index f19643a3fa5..c20e4fc3b09 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -16,7 +16,7 @@ sidebar_label: clickhouse-local While `clickhouse-local` is a great tool for development and testing purposes, and for processing files, it is 
not suitable for serving end users or applications. In these scenarios, it is recommended to use the open-source [ClickHouse](https://clickhouse.com/docs/en/install). ClickHouse is a powerful OLAP database that is designed to handle large-scale analytical workloads. It provides fast and efficient processing of complex queries on large datasets, making it ideal for use in production environments where high-performance is critical. Additionally, ClickHouse offers a wide range of features such as replication, sharding, and high availability, which are essential for scaling up to handle large datasets and serving applications. If you need to handle larger datasets or serve end users or applications, we recommend using open-source ClickHouse instead of `clickhouse-local`. -Please read the docs below that show example use cases for `clickhouse-local`, such as [querying local CSVs](#query-data-in-a-csv-file-using-sql) or [reading a parquet file in S3](#query-data-in-a-parquet-file-in-aws-s3). +Please read the docs below that show example use cases for `clickhouse-local`, such as [querying local file](#query_data_in_file) or [reading a parquet file in S3](#query-data-in-a-parquet-file-in-aws-s3). 
## Download clickhouse-local From b024bb736f6008de741bf6392d0b8432e8cdf16c Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 12 Jul 2024 10:14:41 +0200 Subject: [PATCH 64/70] CI: CiBuddy to post to salck channel from release branches only --- tests/ci/ci.py | 4 +++- tests/ci/ci_buddy.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index b4a3c7ec849..fac50d30022 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1125,7 +1125,9 @@ def main() -> int: "sudo dmesg -T | grep -q -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE'" ): print("WARNING: OOM while job execution") - CIBuddy().post_error("Out Of Memory") + CIBuddy(dry_run=not pr_info.is_release).post_error( + "Out Of Memory", job_name=_get_ext_check_name(args.job_name) + ) job_report = JobReport.load() if JobReport.exist() else None if job_report: diff --git a/tests/ci/ci_buddy.py b/tests/ci/ci_buddy.py index d03f5d819ec..ea690bb602c 100644 --- a/tests/ci/ci_buddy.py +++ b/tests/ci/ci_buddy.py @@ -66,11 +66,11 @@ class CIBuddy: instance_type = Shell.run("ec2metadata --instance-type") or instance_type if not job_name: job_name = os.getenv("CHECK_NAME", "unknown") - line_err = f":red_circle: {error_description} :red_circle:\n\n" - line_ghr = f" *Runner:* `{instance_type}`, `{instance_id}`\n" - line_job = f" *Job:* `{job_name}`\n" - line_pr_ = f" *PR:* \n" - line_br_ = f" *Branch:* `{self.head_ref}`, <{self.commit_url}|commit>\n" + line_err = f":red_circle: *Error: {error_description}*\n\n" + line_ghr = f" *Runner:* `{instance_type}`, `{instance_id}`\n" + line_job = f" *Job:* `{job_name}`\n" + line_pr_ = f" *PR:* \n" + line_br_ = f" *Branch:* `{self.head_ref}`, <{self.commit_url}|commit>\n" message = line_err message += line_job if with_instance_info: From 7d9e1700d2a85330380c19777b788ce5a6c2f605 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 12 Jul 2024 11:24:24 +0200 Subject: [PATCH 65/70] 
update intHash32, intHash64 --- .../functions/array-functions.md | 40 ++++++++++++ .../sql-reference/functions/hash-functions.md | 63 ++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index d87ca4a0fe7..4080dce883f 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -3081,3 +3081,43 @@ Result: ## Distance functions All supported functions are described in [distance functions documentation](../../sql-reference/functions/distance-functions.md). + +## kql_array_sort_asc + +Sorts an array from lowest to highest value. For use with [Kusto Query Language (KQL)](https://clickhouse.com/docs/en/guides/developer/alternative-query-languages#kusto-query-language-kql). + +:::note +For this function to work you should have Kusto enabled. To enable Kusto: + +```sql +SET dialect = 'kusto' +``` + +::: + +**Syntax** + +``` sql +kql_array_sort_asc(arr1 [, arr2 ... arrN]) +``` + +**Arguments** + +- `arr1` — [Array](../data-types/array.md) of numeric values. +- `arr1` — [Array](../data-types/array.md) of numeric values. + +**Returned value** + +- Returns an array of non-negative partial sums of elements in the source array. [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). 
+ +**Example** + +``` sql +SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res +``` + +``` text +┌─res───────┐ +│ [1,2,0,1] │ +└───────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index e431ed75465..d2ed4516fce 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -314,10 +314,71 @@ SELECT groupBitXor(cityHash64(*)) FROM table Calculates a 32-bit hash code from any type of integer. This is a relatively fast non-cryptographic hash function of average quality for numbers. +**Syntax** + +```sql +intHash32(int) +``` + +**Arguments** + +- `int` — Integer to hash. [(U)Int*](../data-types/int-uint.md). + +**Returned value** + +- 32-bit hash code. [UInt32](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT intHash32(42); +``` + +Result: + +```response +┌─intHash32(42)─┐ +│ 1228623923 │ +└───────────────┘ +``` + ## intHash64 Calculates a 64-bit hash code from any type of integer. -It works faster than intHash32. Average quality. +This is a relatively fast non-cryptographic hash function of average quality for numbers. +It works faster than [intHash32](#inthash32). + +**Syntax** + +```sql +intHash32(int) +``` + +**Arguments** + +- `int` — Integer to hash. [(U)Int*](../data-types/int-uint.md). + +**Returned value** + +- 64-bit hash code. [UInt64](../data-types/int-uint.md). 
+ +**Example** + +Query: + +```sql +SELECT intHash64(42); +``` + +Result: + +```response +┌────────intHash64(42)─┐ +│ 11490350930367293593 │ +└──────────────────────┘ +``` ## SHA1, SHA224, SHA256, SHA512, SHA512_256 From 18e411d35366a82e3a2c9a725ccfabfa6a9170b6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 12 Jul 2024 11:28:41 +0200 Subject: [PATCH 66/70] remove unwanted change --- .../functions/array-functions.md | 42 +------------------ 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 4080dce883f..1b52440903d 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -3080,44 +3080,4 @@ Result: ## Distance functions -All supported functions are described in [distance functions documentation](../../sql-reference/functions/distance-functions.md). - -## kql_array_sort_asc - -Sorts an array from lowest to highest value. For use with [Kusto Query Language (KQL)](https://clickhouse.com/docs/en/guides/developer/alternative-query-languages#kusto-query-language-kql). - -:::note -For this function to work you should have Kusto enabled. To enable Kusto: - -```sql -SET dialect = 'kusto' -``` - -::: - -**Syntax** - -``` sql -kql_array_sort_asc(arr1 [, arr2 ... arrN]) -``` - -**Arguments** - -- `arr1` — [Array](../data-types/array.md) of numeric values. -- `arr1` — [Array](../data-types/array.md) of numeric values. - -**Returned value** - -- Returns an array of non-negative partial sums of elements in the source array. [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). 
- -**Example** - -``` sql -SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res -``` - -``` text -┌─res───────┐ -│ [1,2,0,1] │ -└───────────┘ -``` \ No newline at end of file +All supported functions are described in [distance functions documentation](../../sql-reference/functions/distance-functions.md). \ No newline at end of file From 7c6db58eec7d06ae216774b957df992c05e94454 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Fri, 12 Jul 2024 11:34:04 +0200 Subject: [PATCH 67/70] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 78c4b6bde95..ca2c4ec4192 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1900,11 +1900,13 @@ kurtosis kurtpop kurtsamp laion +lagInFrame lang laravel largestTriangleThreeBuckets latencies ldap +leadInFrame leftPad leftPadUTF leftUTF From d9a05bca89f3578bb2cf965b1ce373657de0474a Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 12 Jul 2024 11:43:04 +0200 Subject: [PATCH 68/70] add alias to anyLast_respect_nulls --- .../aggregate-functions/reference/anylast_respect_nulls.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md b/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md index 8f093cfdb61..a28b965f7ea 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md +++ b/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md @@ -13,6 +13,8 @@ Selects the last value encountered, irregardless of whether it is `NULL` or not. anyLast_respect_nulls(column) ``` +Alias: `last_value_respect_nulls`. + **Parameters** - `column`: The column name. 
From 3806ab7ef1ed8f298cbe0a1d3b186a0d29e7d3a6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 12 Jul 2024 12:50:19 +0200 Subject: [PATCH 69/70] remove *_respect_nulls and modify any, anyLast to reflect that they can use modifier RESPECT NULLS --- .../aggregate-functions/index.md | 2 +- .../aggregate-functions/reference/any.md | 8 ++-- .../reference/any_respect_nulls.md | 44 ------------------- .../aggregate-functions/reference/anylast.md | 8 +++- .../reference/anylast_respect_nulls.md | 41 ----------------- .../aggregate-functions/reference/index.md | 3 +- 6 files changed, 13 insertions(+), 93 deletions(-) delete mode 100644 docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md delete mode 100644 docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index 96bf0c5d93b..5056ef2c7aa 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -18,7 +18,7 @@ ClickHouse also supports: During aggregation, all `NULL` arguments are skipped. If the aggregation has several arguments it will ignore any row in which one or more of them are NULL. -There is an exception to this rule, which are the functions [`first_value`](../../sql-reference/aggregate-functions/reference/first_value.md), [`last_value`](../../sql-reference/aggregate-functions/reference/last_value.md) and their aliases when followed by the modifier `RESPECT NULLS`: `FIRST_VALUE(b) RESPECT NULLS`. +There is an exception to this rule, which are the functions [`first_value`](../../sql-reference/aggregate-functions/reference/first_value.md), [`last_value`](../../sql-reference/aggregate-functions/reference/last_value.md) and their aliases (`any` and `anyLast` respectively) when followed by the modifier `RESPECT NULLS`. For example, `FIRST_VALUE(b) RESPECT NULLS`. 
**Examples:** diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index cdff7dde4a9..972263585f2 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -5,12 +5,12 @@ sidebar_position: 102 # any -Selects the first encountered value of a column. +Selects the first encountered value of a column, ignoring any `NULL` values. **Syntax** ```sql -any(column) +any(column) [RESPECT NULLS] ``` Aliases: `any_value`, [`first_value`](../reference/first_value.md). @@ -20,7 +20,9 @@ Aliases: `any_value`, [`first_value`](../reference/first_value.md). **Returned value** -By default, it ignores NULL values and returns the first NOT NULL value found in the column. Like [`first_value`](../../../sql-reference/aggregate-functions/reference/first_value.md) it supports `RESPECT NULLS`, in which case it will select the first value passed, independently on whether it's NULL or not. +:::note +Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the first value passed, regardless of whether it is `NULL` or not. +::: :::note The return type of the function is the same as the input, except for LowCardinality which is discarded. This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) ) to modify this behaviour. 
diff --git a/docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md b/docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md deleted file mode 100644 index 99104a9b8c7..00000000000 --- a/docs/en/sql-reference/aggregate-functions/reference/any_respect_nulls.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -slug: /en/sql-reference/aggregate-functions/reference/any_respect_nulls -sidebar_position: 103 ---- - -# any_respect_nulls - -Selects the first encountered value of a column, irregardless of whether it is a `NULL` value or not. - -Alias: `any_value_respect_nulls`, `first_value_repect_nulls`. - -**Syntax** - -```sql -any_respect_nulls(column) -``` - -**Parameters** -- `column`: The column name. - -**Returned value** - -- The last value encountered, irregardless of whether it is a `NULL` value or not. - -**Example** - -Query: - -```sql -CREATE TABLE any_nulls (city Nullable(String)) ENGINE=Log; - -INSERT INTO any_nulls (city) VALUES (NULL), ('Amsterdam'), ('New York'), ('Tokyo'), ('Valencia'), (NULL); - -SELECT any(city), any_respect_nulls(city) FROM any_nulls; -``` - -```response -┌─any(city)─┬─any_respect_nulls(city)─┐ -│ Amsterdam │ ᴺᵁᴸᴸ │ -└───────────┴─────────────────────────┘ -``` - -**See Also** -- [any](../reference/any.md) diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast.md b/docs/en/sql-reference/aggregate-functions/reference/anylast.md index e43bc07fbdc..202d2e9fb10 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md +++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md @@ -5,17 +5,21 @@ sidebar_position: 105 # anyLast -Selects the last value encountered. The result is just as indeterminate as for the [any](../../../sql-reference/aggregate-functions/reference/any.md) function. +Selects the last value encountered, ignoring any `NULL` values by default. 
The result is just as indeterminate as for the [any](../../../sql-reference/aggregate-functions/reference/any.md) function. **Syntax** ```sql -anyLast(column) +anyLast(column) [RESPECT NULLS] ``` **Parameters** - `column`: The column name. +:::note +Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the last value passed, regardless of whether it is `NULL` or not. +::: + **Returned value** - The last value encountered. diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md b/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md deleted file mode 100644 index a28b965f7ea..00000000000 --- a/docs/en/sql-reference/aggregate-functions/reference/anylast_respect_nulls.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -slug: /en/sql-reference/aggregate-functions/reference/anylast_respect_nulls -sidebar_position: 106 ---- - -# anyLast_respect_nulls - -Selects the last value encountered, irregardless of whether it is `NULL` or not. - -**Syntax** - -```sql -anyLast_respect_nulls(column) -``` - -Alias: `last_value_respect_nulls`. - -**Parameters** -- `column`: The column name. - -**Returned value** - -- The last value encountered, irregardless of whether it is `NULL` or not.
- -**Example** - -Query: - -```sql -CREATE TABLE any_last_nulls (city Nullable(String)) ENGINE=Log; - -INSERT INTO any_last_nulls (city) VALUES ('Amsterdam'),(NULL),('New York'),('Tokyo'),('Valencia'),(NULL); - -SELECT anyLast(city), anyLast_respect_nulls(city) FROM any_last_nulls; -``` - -```response -┌─anyLast(city)─┬─anyLast_respect_nulls(city)─┐ -│ Valencia │ ᴺᵁᴸᴸ │ -└───────────────┴─────────────────────────────┘ -``` \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index e3725b6a430..323a99d276f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -44,10 +44,9 @@ Standard aggregate functions: ClickHouse-specific aggregate functions: - [analysisOfVariance](../reference/analysis_of_variance.md) -- [any](../reference/any_respect_nulls.md) +- [any](../reference/any.md) - [anyHeavy](../reference/anyheavy.md) - [anyLast](../reference/anylast.md) -- [anyLast](../reference/anylast_respect_nulls.md) - [boundingRatio](../reference/boundrat.md) - [first_value](../reference/first_value.md) - [last_value](../reference/last_value.md) From 633db10d397a030b9e0f5aa4435fe5d6c002b54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 12 Jul 2024 12:58:14 +0200 Subject: [PATCH 70/70] Update docs/en/sql-reference/functions/hash-functions.md --- docs/en/sql-reference/functions/hash-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index d2ed4516fce..7c977e7d6dc 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -353,7 +353,7 @@ It works faster than [intHash32](#inthash32). **Syntax** ```sql -intHash32(int) +intHash64(int) ``` **Arguments**