From 68cdfbcc6c95ae601c9788eb75104e8fee896fbf Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Sep 2023 16:03:03 +0000 Subject: [PATCH 1/7] Refactorings and cleanups (semantics did not change) --- src/Functions/FunctionHelpers.cpp | 2 +- src/Functions/FunctionsStringArray.cpp | 18 +- src/Functions/FunctionsStringArray.h | 246 ++++++------------ src/Functions/URL/URLHierarchy.cpp | 25 +- src/Functions/URL/URLPathHierarchy.cpp | 25 +- .../URL/extractURLParameterNames.cpp | 24 +- src/Functions/URL/extractURLParameters.cpp | 23 +- 7 files changed, 121 insertions(+), 242 deletions(-) diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index 7a9817ad344..6c3e438dea7 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -105,7 +105,7 @@ void validateArgumentType(const IFunction & func, const DataTypes & arguments, const auto & argument = arguments[argument_index]; if (!validator_func(*argument)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {} expected {}", + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {}, expected {}", argument->getName(), std::to_string(argument_index), func.getName(), expected_type_description); } diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index e7519068f44..4d118481bb2 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -9,19 +9,17 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const DataTypes & arguments) const +DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const { - if (arguments.size() != 1 && arguments.size() != 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: 
passed {}, should be 1 or 2.", - getName(), arguments.size()); + FunctionArgumentDescriptors mandatory_args{ + {"arr", &isArray, nullptr, "Array"}, + }; - const DataTypeArray * array_type = checkAndGetDataType(arguments[0].get()); - if (!array_type) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", getName()); + FunctionArgumentDescriptors optional_args{ + {"separator", &isString, isColumnConst, "const String"}, + }; - if (arguments.size() == 2 && !isString(arguments[1])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} must be constant string.", getName()); + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); return std::make_shared(); } diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 8d41789b556..ce78090dc6b 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -66,7 +66,6 @@ private: Pos end; public: - /// Get the name of the function. static constexpr auto name = "alphaTokens"; static String getName() { return name; } @@ -74,18 +73,22 @@ public: static size_t getNumberOfArguments() { return 0; } - /// Check the type of the function's arguments. - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (arguments.empty() || arguments.size() > 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName()); + FunctionArgumentDescriptors mandatory_args{ + {"s", &isString, nullptr, "String"}, + }; - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. 
" - "Must be String.", arguments[0]->getName(), getName()); + FunctionArgumentDescriptors optional_args{ + {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); } - /// Initialize by the function arguments. + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(1); + void init(const ColumnsWithTypeAndName & /*arguments*/) {} /// Called for each next string. @@ -95,18 +98,6 @@ public: end = end_; } - /// Returns the position of the argument, that is the column of strings - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 1; - } - /// Get the next token, if any, or return false. bool get(Pos & token_begin, Pos & token_end) { @@ -142,18 +133,14 @@ public: static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } - /// Check the type of the function's arguments. - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (arguments.empty() || arguments.size() > 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName()); - - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. " - "Must be String.", arguments[0]->getName(), getName()); + SplitByAlphaImpl::checkArguments(func, arguments); } - /// Initialize by the function arguments. 
+ static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(1); + void init(const ColumnsWithTypeAndName & /*arguments*/) {} /// Called for each next string. @@ -163,18 +150,6 @@ public: end = end_; } - /// Returns the position of the argument, that is the column of strings - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 1; - } - /// Get the next token, if any, or return false. bool get(Pos & token_begin, Pos & token_end) { @@ -203,25 +178,20 @@ private: Pos end; public: - /// Get the name of the function. static constexpr auto name = "splitByWhitespace"; static String getName() { return name; } static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } - /// Check the type of the function's arguments. - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (arguments.empty() || arguments.size() > 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName()); - - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. " - "Must be String.", arguments[0]->getName(), getName()); + return SplitByNonAlphaImpl::checkArguments(func, arguments); } - /// Initialize by the function arguments. + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(1); + void init(const ColumnsWithTypeAndName & /*arguments*/) {} /// Called for each next string. 
@@ -231,18 +201,6 @@ public: end = end_; } - /// Returns the position of the argument, that is the column of strings - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 1; - } - /// Get the next token, if any, or return false. bool get(Pos & token_begin, Pos & token_end) { @@ -269,7 +227,7 @@ class SplitByCharImpl private: Pos pos; Pos end; - char sep; + char separator; public: static constexpr auto name = "splitByChar"; @@ -277,23 +235,23 @@ public: static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (arguments.size() < 2 || arguments.size() > 3) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.", - name, arguments.size()); + FunctionArgumentDescriptors mandatory_args{ + {"separator", &isString, isColumnConst, "const String"}, + {"s", &isString, nullptr, "String"} + }; - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. " - "Must be String.", arguments[0]->getName(), getName()); + FunctionArgumentDescriptors optional_args{ + {"max_substrings", &isNativeInteger, isColumnConst, "const Number"}, + }; - if (!isString(arguments[1])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of second argument of function {}. 
" - "Must be String.", arguments[1]->getName(), getName()); + validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args); } + static constexpr auto strings_argument_position = 1uz; + static constexpr auto max_substrings_argument_position = std::make_optional(2); + void init(const ColumnsWithTypeAndName & arguments) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -307,19 +265,7 @@ public: if (sep_str.size() != 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", getName()); - sep = sep_str[0]; - } - - /// Returns the position of the argument, that is the column of strings - static size_t getStringsArgumentPosition() - { - return 1; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 2; + separator = sep_str[0]; } void set(Pos pos_, Pos end_) @@ -334,7 +280,7 @@ public: return false; token_begin = pos; - pos = reinterpret_cast(memchr(pos, sep, end - pos)); + pos = reinterpret_cast(memchr(pos, separator, end - pos)); if (pos) { @@ -355,7 +301,7 @@ private: Pos pos; Pos end; - String sep; + String separator; public: static constexpr auto name = "splitByString"; @@ -363,11 +309,14 @@ public: static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - SplitByCharImpl::checkArguments(arguments); + SplitByCharImpl::checkArguments(func, arguments); } + static constexpr auto strings_argument_position = 1uz; + static constexpr auto max_substrings_argument_position = std::make_optional(2); + void init(const ColumnsWithTypeAndName & arguments) { const ColumnConst * col = 
checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -376,19 +325,7 @@ public: throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " "Must be constant string.", arguments[0].column->getName(), getName()); - sep = col->getValue(); - } - - /// Returns the position of the argument that is the column of strings - static size_t getStringsArgumentPosition() - { - return 1; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 2; + separator = col->getValue(); } /// Called for each next string. @@ -401,7 +338,7 @@ public: /// Get the next token, if any, or return false. bool get(Pos & token_begin, Pos & token_end) { - if (sep.empty()) + if (separator.empty()) { if (pos == end) return false; @@ -417,12 +354,12 @@ public: token_begin = pos; - pos = reinterpret_cast(memmem(pos, end - pos, sep.data(), sep.size())); + pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); if (pos) { token_end = pos; - pos += sep.size(); + pos += separator.size(); } else token_end = end; @@ -448,13 +385,14 @@ public: static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } - /// Check the type of function arguments. - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - SplitByStringImpl::checkArguments(arguments); + SplitByStringImpl::checkArguments(func, arguments); } - /// Initialize by the function arguments. 
+ static constexpr auto strings_argument_position = 1uz; + static constexpr auto max_substrings_argument_position = std::make_optional(2); + void init(const ColumnsWithTypeAndName & arguments) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -467,18 +405,6 @@ public: re = std::make_shared(Regexps::createRegexp(col->getValue())); } - /// Returns the position of the argument that is the column of strings - static size_t getStringsArgumentPosition() - { - return 1; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return 2; - } - /// Called for each next string. void set(Pos pos_, Pos end_) { @@ -536,13 +462,19 @@ public: static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 2; } - /// Check the type of function arguments. - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - SplitByStringImpl::checkArguments(arguments); + FunctionArgumentDescriptors mandatory_args{ + {"haystack", &isString, nullptr, "String"}, + {"pattern", &isString, isColumnConst, "const String"} + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); } - /// Initialize by the function arguments. 
+ static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(); + void init(const ColumnsWithTypeAndName & arguments) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -557,18 +489,6 @@ public: matches.resize(capture + 1); } - /// Returns the position of the argument that is the column of strings - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return std::nullopt; - } - /// Called for each next string. void set(Pos pos_, Pos end_) { @@ -611,10 +531,7 @@ public: static constexpr auto name = Generator::name; static FunctionPtr create(ContextPtr) { return std::make_shared(); } - String getName() const override - { - return name; - } + String getName() const override { return name; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } @@ -622,18 +539,9 @@ public: size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - Generator::checkArguments(arguments); - - const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition(); - if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "{}-th argument for function '{}' must be integer, got '{}' instead", - *max_substrings_pos + 1, - getName(), - arguments[*max_substrings_pos]->getName()); + Generator::checkArguments(*this, arguments); return 
std::make_shared(std::make_shared()); } @@ -642,22 +550,24 @@ public: { Generator generator; generator.init(arguments); - const auto & array_argument = arguments[generator.getStringsArgumentPosition()]; + + const auto & array_argument = arguments[generator.strings_argument_position]; /// Whether we need to limit max tokens returned by Generator::get /// If max_substrings is std::nullopt, no limit is applied. auto max_substrings = getMaxSubstrings(arguments); const ColumnString * col_str = checkAndGetColumn(array_argument.column.get()); - const ColumnConst * col_const_str = - checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); + const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); auto col_res = ColumnArray::create(ColumnString::create()); + ColumnString & res_strings = typeid_cast(col_res->getData()); - ColumnArray::Offsets & res_offsets = col_res->getOffsets(); ColumnString::Chars & res_strings_chars = res_strings.getChars(); ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets(); + ColumnArray::Offsets & res_offsets = col_res->getOffsets(); + if (col_str) { const ColumnString::Chars & src_chars = col_str->getChars(); @@ -701,9 +611,9 @@ public: return col_res; } - else if (col_const_str) + else if (col_str_const) { - String src = col_const_str->getValue(); + String src = col_str_const->getValue(); Array dst; generator.set(src.data(), src.data() + src.size()); @@ -713,7 +623,7 @@ public: while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings)) dst.push_back(String(token_begin, token_end - token_begin)); - return result_type->createColumnConst(col_const_str->size(), dst); + return result_type->createColumnConst(col_str_const->size(), dst); } else throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}", @@ -734,7 +644,7 @@ private: std::optional getMaxSubstrings(const ColumnsWithTypeAndName & 
arguments) const { - const auto pos = Generator::getMaxSubstringsArgumentPosition(); + const auto pos = Generator::max_substrings_argument_position; if (!pos) return std::nullopt; @@ -758,7 +668,7 @@ private: if (max_substrings && *max_substrings <= 0) return std::nullopt; - return *max_substrings; + return max_substrings; } }; @@ -803,7 +713,7 @@ private: /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1. for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset) { - if (unlikely(null_map && null_map[current_src_array_offset])) + if (null_map && null_map[current_src_array_offset]) [[unlikely]] continue; if (!first_non_null) @@ -881,7 +791,7 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } size_t getNumberOfArguments() const override { return 0; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override; ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index d3a45efb498..69819d2214f 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -23,27 +23,20 @@ public: static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. 
" - "Must be String.", arguments[0]->getName(), getName()); + FunctionArgumentDescriptors mandatory_args{ + {"URL", &isString, nullptr, "String"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); } + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(); + void init(const ColumnsWithTypeAndName & /*arguments*/) {} - /// Returns the position of the argument that is the column of rows - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return std::nullopt; - } - /// Called for each next string. void set(Pos pos_, Pos end_) { diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index 3775748f6ed..2c4f4e9be5c 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -22,27 +22,20 @@ public: static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. 
" - "Must be String.", arguments[0]->getName(), getName()); + FunctionArgumentDescriptors mandatory_args{ + {"URL", &isString, nullptr, "String"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); } + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(); + void init(const ColumnsWithTypeAndName & /*arguments*/) {} - /// Returns the position of the argument that is the column of rows - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return std::nullopt; - } - /// Called for each next string. void set(Pos pos_, Pos end_) { diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 4ca2d79d22d..0e9153acf7f 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -22,25 +22,17 @@ public: static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. " - "Must be String.", arguments[0]->getName(), getName()); - } - - /// Returns the position of the argument that is the column of rows - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. 
- static std::optional getMaxSubstringsArgumentPosition() - { - return std::nullopt; + FunctionArgumentDescriptors mandatory_args{ + {"URL", &isString, nullptr, "String"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); } + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(); void init(const ColumnsWithTypeAndName & /*arguments*/) {} diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index a44157e1b35..273edde8d18 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -22,26 +22,19 @@ public: static bool isVariadic() { return false; } static size_t getNumberOfArguments() { return 1; } - static void checkArguments(const DataTypes & arguments) + static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { - if (!isString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. " - "Must be String.", arguments[0]->getName(), getName()); + FunctionArgumentDescriptors mandatory_args{ + {"URL", &isString, nullptr, "String"}, + }; + + validateFunctionArgumentTypes(func, arguments, mandatory_args); } void init(const ColumnsWithTypeAndName & /*arguments*/) {} - /// Returns the position of the argument that is the column of rows - static size_t getStringsArgumentPosition() - { - return 0; - } - - /// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function. - static std::optional getMaxSubstringsArgumentPosition() - { - return std::nullopt; - } + static constexpr auto strings_argument_position = 0uz; + static constexpr auto max_substrings_argument_position = std::make_optional(); /// Called for each next string. 
void set(Pos pos_, Pos end_) From b5b2cc511b7171332a368895e32589d573b46185 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Sep 2023 18:48:40 +0000 Subject: [PATCH 2/7] Pythonic vs Spark splitting --- docs/en/operations/settings/settings.md | 11 + .../functions/splitting-merging-functions.md | 12 + src/Core/Settings.h | 1 + src/Functions/FunctionsStringArray.cpp | 34 +++ src/Functions/FunctionsStringArray.h | 269 +++++++++++++----- src/Functions/URL/URLHierarchy.cpp | 3 +- src/Functions/URL/URLPathHierarchy.cpp | 3 +- .../URL/extractURLParameterNames.cpp | 3 +- src/Functions/URL/extractURLParameters.cpp | 3 +- .../02876_splitby_spark_vs_python.reference | 22 ++ .../02876_splitby_spark_vs_python.sql | 27 ++ 11 files changed, 314 insertions(+), 74 deletions(-) create mode 100644 tests/queries/0_stateless/02876_splitby_spark_vs_python.reference create mode 100644 tests/queries/0_stateless/02876_splitby_spark_vs_python.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e9e5920fa59..227483758d5 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4067,6 +4067,17 @@ Result: └─────┴─────┴───────┘ ``` +## split_tokens_like_python {#split-tokens-like-python} + +Controls if functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with `max_substrings` argument > 0 include the remaining string (if any) in the result array (Python semantics) or not (Spark semantics). + +Possible values: + +- 0 - Don't include the remaining string (Spark semantics). +- 1 - Include the remaining string (Python semantics). + +Default value: `0`. 
+ ## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} Enables or disables returning results of type: diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index c88643ef7cf..7e788a8e45b 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -38,6 +38,8 @@ The behavior of parameter `max_substrings` changed starting with ClickHouse v22. For example, - in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']` - in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']` + +The previous behavior can be restored by setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) = 1. ::: **Example** @@ -80,6 +82,8 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. + **Example** ``` sql @@ -133,6 +137,8 @@ Returns an array of selected substrings. Empty substrings may be selected when: Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. + **Example** ``` sql @@ -182,6 +188,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
+Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. + **Example** ``` sql @@ -219,6 +227,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. + **Example** ``` sql @@ -279,6 +289,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. + **Example** ``` sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e8430e96115..14e99918983 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -502,6 +502,7 @@ class IColumn; M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. 
These functions are slow and may impose security considerations.", 0) \ + M(Bool, split_tokens_like_python, false, "If true, then functions splitBy*() with given max_substring argument include remaining string in the result (Python semantics) or not (Spark semantics).", 0) \ \ M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 4d118481bb2..51b50d793e9 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -9,6 +9,40 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +template +std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument) +{ + const auto * col = checkAndGetColumnConst>(argument.column.get()); + if (!col) + return std::nullopt; + + auto value = col->template getValue(); + return static_cast(value); +} + +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) +{ + if (max_substrings_argument_position >= arguments.size()) + return std::nullopt; + + std::optional max_splits; + if (!((max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits 
= extractMaxSplitsImpl(arguments[max_substrings_argument_position])))) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {}, which is {}-th argument",// of function {}", + arguments[max_substrings_argument_position].column->getName(), + max_substrings_argument_position + 1);//, + /// getName()); + + if (max_splits && *max_splits <= 0) + return std::nullopt; + + return max_splits; +} + DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const { FunctionArgumentDescriptors mandatory_args{ diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index ce78090dc6b..92eb015e6e3 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -56,6 +56,13 @@ namespace ErrorCodes using Pos = const char *; +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position); + +enum class SplitTokenMode +{ + LikeSpark, + LikePython +}; /// Substring generators. All of them have a common interface. @@ -64,6 +71,9 @@ class SplitByAlphaImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "alphaTokens"; @@ -87,15 +97,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -110,10 +124,26 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + while (pos < end && isAlphaASCII(*pos)) ++pos; token_end = pos; + ++splits; return true; } @@ -124,6 +154,9 @@ class SplitByNonAlphaImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: /// Get the name of the function. @@ -139,15 +172,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -162,10 +199,25 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) ++pos; token_end = pos; + splits++; return true; } @@ -176,6 +228,9 @@ class SplitByWhitespaceImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByWhitespace"; @@ -190,15 +245,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -213,10 +272,26 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + while (pos < end && !isWhitespaceASCII(*pos)) ++pos; token_end = pos; + splits++; return true; } @@ -228,6 +303,9 @@ private: Pos pos; Pos end; char separator; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByChar"; @@ -250,9 +328,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -266,12 +343,16 @@ public: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. 
Must be exactly one byte.", getName()); separator = sep_str[0]; + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } bool get(Pos & token_begin, Pos & token_end) @@ -280,12 +361,28 @@ public: return false; token_begin = pos; - pos = reinterpret_cast(memchr(pos, separator, end - pos)); + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + + pos = reinterpret_cast(memchr(pos, separator, end - pos)); if (pos) { token_end = pos; ++pos; + ++splits; } else token_end = end; @@ -300,8 +397,10 @@ class SplitByStringImpl private: Pos pos; Pos end; - String separator; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByString"; @@ -315,9 +414,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -326,6 +424,9 @@ public: "Must be constant string.", arguments[0].column->getName(), getName()); separator = col->getValue(); + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. @@ -333,6 +434,7 @@ public: { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -344,8 +446,25 @@ public: return false; token_begin = pos; + + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + pos += 1; token_end = pos; + ++splits; } else { @@ -354,8 +473,22 @@ public: token_begin = pos; - pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); if (pos) { token_end = pos; @@ -363,6 +496,7 @@ public: } else token_end = end; + ++splits; } return true; @@ -378,6 +512,10 @@ private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; + public: static constexpr auto name = "splitByRegexp"; static String getName() { return name; } @@ -391,9 +529,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -403,6 +540,9 @@ public: if (!col->getValue().empty()) re = std::make_shared(Regexps::createRegexp(col->getValue())); + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. @@ -410,6 +550,7 @@ public: { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -421,8 +562,25 @@ public: return false; token_begin = pos; + + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + pos += 1; token_end = pos; + ++splits; } else { @@ -431,6 +589,21 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + if (!re->match(pos, end - pos, matches) || !matches[0].length) { token_end = end; @@ -441,6 +614,7 @@ public: token_end = pos + matches[0].offset; pos = token_end + matches[0].length; } + ++splits; } return true; @@ -473,9 +647,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode /*split_token_mode*/) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -527,9 +700,18 @@ public: template class FunctionTokens : public IFunction { +private: + SplitTokenMode split_token_mode; + public: static constexpr auto name = Generator::name; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionTokens(ContextPtr context) + { + const Settings & settings = context->getSettingsRef(); + split_token_mode = settings.split_tokens_like_python ? 
SplitTokenMode::LikePython : SplitTokenMode::LikeSpark; + } String getName() const override { return name; } @@ -549,14 +731,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { Generator generator; - generator.init(arguments); + generator.init(arguments, split_token_mode); const auto & array_argument = arguments[generator.strings_argument_position]; - /// Whether we need to limit max tokens returned by Generator::get - /// If max_substrings is std::nullopt, no limit is applied. - auto max_substrings = getMaxSubstrings(arguments); - const ColumnString * col_str = checkAndGetColumn(array_argument.column.get()); const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); @@ -592,7 +770,7 @@ public: generator.set(pos, end); size_t j = 0; - while (generator.get(token_begin, token_end) && !(max_substrings && j >= *max_substrings)) + while (generator.get(token_begin, token_end)) { size_t token_size = token_end - token_begin; @@ -620,7 +798,7 @@ public: Pos token_begin = nullptr; Pos token_end = nullptr; - while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings)) + while (generator.get(token_begin, token_end)) dst.push_back(String(token_begin, token_end - token_begin)); return result_type->createColumnConst(col_str_const->size(), dst); @@ -629,47 +807,6 @@ public: throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}", array_argument.column->getName(), array_argument.column->getName(), getName()); } - -private: - template - std::optional getMaxSubstringsImpl(const ColumnWithTypeAndName & argument) const - { - const auto * col = checkAndGetColumnConst>(argument.column.get()); - if (!col) - return {}; - - auto value = col->template getValue(); - return static_cast(value); - } - - std::optional getMaxSubstrings(const ColumnsWithTypeAndName & 
arguments) const - { - const auto pos = Generator::max_substrings_argument_position; - if (!pos) - return std::nullopt; - - if (*pos >= arguments.size()) - return std::nullopt; - - std::optional max_substrings; - if (!((max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])))) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {}, which is {}-th argument of function {}", - arguments[*pos].column->getName(), - *pos + 1, - getName()); - - /// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to - /// no max_substrings argument in function - if (max_substrings && *max_substrings <= 0) - return std::nullopt; - - return max_substrings; - } }; diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index 69819d2214f..bce876f735f 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -33,9 +33,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index 2c4f4e9be5c..b44144a5358 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -32,9 +32,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 0e9153acf7f..785ed050d15 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -32,9 +32,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index 273edde8d18..c21ced2a3aa 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -31,10 +31,9 @@ public: validateFunctionArgumentTypes(func, arguments, mandatory_args); } - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference b/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference new file mode 100644 index 00000000000..0c73fd7de76 --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference @@ -0,0 +1,22 @@ +splitByAlpha +['ab','cd'] +['ab','cd','ef.gh'] +splitByNonAlpha +['128','0'] +['128','0','0.1'] +splitByWhitespace +['Nein,','nein,'] +['Nein,','nein,','nein! Doch!'] +splitByChar +['a','b'] +['a','b','c=d'] +splitByString +['a','='] +['a','=','=b==c==d'] +['a','b'] +['a','b','c==d'] +splitByRegexp +['a','1'] +['a','1','2bc23de345f'] +['a','bc'] +['a','bc','de345f'] diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql b/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql new file mode 100644 index 00000000000..c550f69bd0c --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql @@ -0,0 +1,27 @@ +SELECT 'splitByAlpha'; +SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 0; +SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByNonAlpha'; +SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 0; +SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByWhitespace'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) settings split_tokens_like_python = 0; +SELECT splitByWhitespace('Nein, nein, nein! 
Doch!', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByChar'; +SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 1; + +SELECT 'splitByString'; +SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; +SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; + +SELECT 'splitByRegexp'; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1; From 2a5aa289e85707c8fa8b8363a3fdc7fe3b1c0ba4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 13 Sep 2023 12:03:17 +0000 Subject: [PATCH 3/7] Implement true Python/Spark split behavior --- docs/en/operations/settings/settings.md | 11 +- .../functions/splitting-merging-functions.md | 12 +- src/Core/Settings.h | 2 +- src/Functions/FunctionsStringArray.cpp | 27 +- src/Functions/FunctionsStringArray.h | 382 ++++++++++++------ src/Functions/URL/URLHierarchy.cpp | 2 +- src/Functions/URL/URLPathHierarchy.cpp | 2 +- .../URL/extractURLParameterNames.cpp | 2 +- src/Functions/URL/extractURLParameters.cpp | 2 +- ...6_splitby_max_substring_behavior.reference | 126 ++++++ .../02876_splitby_max_substring_behavior.sql | 151 +++++++ .../02876_splitby_spark_vs_python.reference | 22 - .../02876_splitby_spark_vs_python.sql | 27 -- 13 files changed, 584 insertions(+), 184 deletions(-) create mode 100644 tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference create mode 100644 
tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql delete mode 100644 tests/queries/0_stateless/02876_splitby_spark_vs_python.reference delete mode 100644 tests/queries/0_stateless/02876_splitby_spark_vs_python.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 227483758d5..ad1437ea3eb 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4067,16 +4067,17 @@ Result: └─────┴─────┴───────┘ ``` -## split_tokens_like_python {#split-tokens-like-python} +## splitby_max_substring_behavior {#splitby-max-substring-behavior} -Controls if functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with `max_substring` argument > 0 include the remaining string (if any) in the result array (Python semantics) or not (Spark semantics). +Controls how functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with given `max_substring` argument behave. Possible values: -- 0 - Don't include the remaining string (Spark semantics). -- 1 - Include the remaining string (Python semantics). +- `''` - If `max_substring` >= 1, return the first `max_substring`-many substrings. +- `'python'` - If `max_substring` >= 0, split `max_substring`-many times, and return `max_substring + 1` elements where the last element contains the remaining string. +- `'spark'` - If `max_substring` >= 1, split `max_substring - 1`-many times, and return `max_substring` elements where the last element contains the remaining string. -Default value: `0`. +Default value: `''`. 
## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 7e788a8e45b..1e0bc3da664 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -39,7 +39,7 @@ For example, - in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']` - in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']` -The previous behavior can be restored by setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) = 1. +The previous behavior can be restored by setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) = 'python'. ::: **Example** @@ -82,7 +82,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. -Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. +Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. **Example** @@ -137,7 +137,7 @@ Returns an array of selected substrings. Empty substrings may be selected when: Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. 
+Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. **Example** @@ -188,7 +188,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. +Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. **Example** @@ -227,7 +227,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. +Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. **Example** @@ -289,7 +289,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether with `max_substrings` > 0, the remaining string (if any) is included in the result array or not. +Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. 
**Example** diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 14e99918983..ca8f82ed8b6 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -502,7 +502,7 @@ class IColumn; M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ - M(Bool, split_tokens_like_python, false, "If true, then functions splitBy*() with given max_substring argument include remaining string in the result (Python semantics) or not (Spark semantics).", 0) \ + M(String, splitby_max_substring_behavior, "", "Control the behavior of the 'max_substring' argument in functions splitBy*(): '' (default), 'python' or 'spark'", 0) \ \ M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 51b50d793e9..085cb2c8eae 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -20,7 +20,7 @@ std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument return static_cast(value); } -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position, MaxSubstringBehavior max_substring_behavior) { if 
(max_substrings_argument_position >= arguments.size()) return std::nullopt; @@ -32,13 +32,28 @@ std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])))) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {}, which is {}-th argument",// of function {}", + "Illegal column {}, which is {}-th argument", arguments[max_substrings_argument_position].column->getName(), - max_substrings_argument_position + 1);//, - /// getName()); + max_substrings_argument_position + 1); + + if (max_splits) + switch (max_substring_behavior) + { + case MaxSubstringBehavior::LikeClickHouse: + case MaxSubstringBehavior::LikeSpark: + { + if (*max_splits <= 0) + return std::nullopt; + break; + } + case MaxSubstringBehavior::LikePython: + { + if (*max_splits < 0) + return std::nullopt; + break; + } + } - if (max_splits && *max_splits <= 0) - return std::nullopt; return max_splits; } diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 92eb015e6e3..8b732292a1c 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -56,14 +56,15 @@ namespace ErrorCodes using Pos = const char *; -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position); - -enum class SplitTokenMode +enum class MaxSubstringBehavior { + LikeClickHouse, LikeSpark, LikePython }; +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position, MaxSubstringBehavior max_substring_behavior); + /// Substring generators. All of them have a common interface. 
class SplitByAlphaImpl @@ -73,7 +74,7 @@ private: Pos end; std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = "alphaTokens"; @@ -98,10 +99,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) { - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 1); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 1, max_substring_behavior); } /// Called for each next string. @@ -124,18 +125,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = end; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = end; + return true; + } + break; + } } } @@ -156,7 +175,7 @@ private: Pos end; std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: /// Get the name of the function. 
@@ -173,10 +192,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) { - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 1); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 1, max_substring_behavior); } /// Called for each next string. @@ -199,20 +218,39 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = end; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = end; + return true; + } + break; + } } } + while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) ++pos; @@ -230,7 +268,7 @@ private: Pos end; std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = "splitByWhitespace"; @@ -246,10 +284,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) { - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 1); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 1, 
max_substring_behavior); } /// Called for each next string. @@ -272,18 +310,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = end; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = end; + return true; + } + break; + } } } @@ -305,7 +361,7 @@ private: char separator; std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = "splitByChar"; @@ -329,7 +385,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -344,8 +400,8 @@ public: separator = sep_str[0]; - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 2); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); } void set(Pos pos_, Pos end_) @@ -362,18 +418,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = nullptr; - return true; - } + case 
MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } } } @@ -400,7 +474,7 @@ private: String separator; std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = "splitByString"; @@ -415,7 +489,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -425,8 +499,8 @@ public: separator = col->getValue(); - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 2); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); } /// Called for each next string. 
@@ -447,18 +521,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = end; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = end; + return true; + } + break; + } } } @@ -473,18 +565,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = nullptr; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } } } @@ -493,10 +603,10 @@ public: { token_end = pos; pos += separator.size(); + ++splits; } else token_end = end; - ++splits; } return true; @@ -514,7 +624,7 @@ private: std::optional max_splits; size_t splits; - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = "splitByRegexp"; @@ -530,7 +640,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + void init(const ColumnsWithTypeAndName & 
arguments, MaxSubstringBehavior max_substring_behavior_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -541,8 +651,8 @@ public: if (!col->getValue().empty()) re = std::make_shared(Regexps::createRegexp(col->getValue())); - split_token_mode = split_token_mode_; - max_splits = extractMaxSplits(arguments, 2); + max_substring_behavior = max_substring_behavior_; + max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); } /// Called for each next string. @@ -563,18 +673,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = end; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = end; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = end; + return true; + } + break; + } } } @@ -589,18 +717,36 @@ public: token_begin = pos; - if (max_splits && splits >= max_splits) + if (max_splits) { - switch (split_token_mode) + switch (max_substring_behavior) { - case SplitTokenMode::LikeSpark: - return false; - case SplitTokenMode::LikePython: - { - token_end = end; - pos = nullptr; - return true; - } + case MaxSubstringBehavior::LikeClickHouse: + { + if (splits == *max_splits) + return false; + break; + } + case MaxSubstringBehavior::LikeSpark: + { + if (splits == *max_splits - 1) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } + case MaxSubstringBehavior::LikePython: + { + if (splits == *max_splits) + { + token_end = end; + pos = nullptr; + return true; + } + break; + } } } @@ -613,8 +759,8 @@ public: { token_end 
= pos + matches[0].offset; pos = token_end + matches[0].length; + ++splits; } - ++splits; } return true; @@ -648,7 +794,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode /*split_token_mode*/) + void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior /*max_substring_behavior*/) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -701,7 +847,7 @@ template class FunctionTokens : public IFunction { private: - SplitTokenMode split_token_mode; + MaxSubstringBehavior max_substring_behavior; public: static constexpr auto name = Generator::name; @@ -710,7 +856,17 @@ public: explicit FunctionTokens(ContextPtr context) { const Settings & settings = context->getSettingsRef(); - split_token_mode = settings.split_tokens_like_python ? SplitTokenMode::LikePython : SplitTokenMode::LikeSpark; + if (settings.splitby_max_substring_behavior.value == "") + max_substring_behavior = MaxSubstringBehavior::LikeClickHouse; + else if (settings.splitby_max_substring_behavior.value == "python") + max_substring_behavior = MaxSubstringBehavior::LikePython; + else if (settings.splitby_max_substring_behavior.value == "spark") + max_substring_behavior = MaxSubstringBehavior::LikeSpark; + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal value {} for setting splitby_max_substring_behavior in function {}, must be '', 'python' or 'spark'", + settings.splitby_max_substring_behavior.value, getName()); } String getName() const override { return name; } @@ -731,7 +887,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { Generator generator; - generator.init(arguments, split_token_mode); + generator.init(arguments, max_substring_behavior); const auto & array_argument = arguments[generator.strings_argument_position]; diff --git 
a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index bce876f735f..5bd5629992f 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -34,7 +34,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index b44144a5358..714f56ece36 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -33,7 +33,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 785ed050d15..3d40013335a 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -33,7 +33,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index c21ced2a3aa..82df7888196 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -31,7 +31,7 @@ public: validateFunctionArgumentTypes(func, arguments, mandatory_args); } - void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} static constexpr auto strings_argument_position = 0uz; diff --git a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference new file mode 100644 index 00000000000..9966c7d090e --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference @@ -0,0 +1,126 @@ +-- splitByAlpha +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab'] +['ab','cd'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab.cd.ef.gh'] +['ab','cd.ef.gh'] +['ab','cd','ef.gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab.cd.ef.gh'] +['ab','cd.ef.gh'] +-- splitByNonAlpha +['128','0','0','1'] +['128','0','0','1'] +['128','0','0','1'] +['128'] +['128','0'] +['128','0','0','1'] +['128','0','0','1'] +['128.0.0.1'] +['128','0.0.1'] +['128','0','0.1'] +['128','0','0','1'] +['128','0','0','1'] +['128','0','0','1'] +['128.0.0.1'] +['128','0.0.1'] +-- splitByWhitespace +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,'] +['Nein,','nein,'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein, nein, nein! Doch!'] +['Nein,','nein, nein! Doch!'] +['Nein,','nein,','nein! Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein, nein, nein! 
Doch!'] +['Nein,','nein, nein! Doch!'] +-- splitByChar +['a','','b','c','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a'] +['a',''] +['a','','b','c','d'] +['a','','b','c','d'] +['a==b=c=d'] +['a','=b=c=d'] +['a','','b=c=d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a==b=c=d'] +['a','=b=c=d'] +-- splitByString +['a','b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a'] +['a','b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a==b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a','b=c=d'] +['a==b=c=d'] +['a','b=c=d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a'] +['a','='] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a==b=c=d'] +['a','==b=c=d'] +['a','=','=b=c=d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a==b=c=d'] +['a','==b=c=d'] +-- splitByRegexp +['a','bc','de','f'] +['a','bc','de','f'] +['a','bc','de','f'] +['a'] +['a','bc'] +['a','bc','de','f'] +['a','bc','de','f'] +['a12bc23de345f'] +['a','bc23de345f'] +['a','bc','de345f'] +['a','bc','de','f'] +['a','bc','de','f'] +['a','bc','de','f'] +['a12bc23de345f'] +['a','bc23de345f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a'] +['a','1'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a12bc23de345f'] +['a','12bc23de345f'] +['a','1','2bc23de345f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a12bc23de345f'] +['a','12bc23de345f'] diff --git a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql new file mode 100644 index 
00000000000..1dcad65f09b --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql @@ -0,0 +1,151 @@ +SELECT '-- splitByAlpha'; +SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT '-- splitByNonAlpha'; +SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByNonAlpha('128.0.0.1') SETTINGS 
splitby_max_substring_behavior = 'python'; +SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT '-- splitByWhitespace'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByWhitespace('Nein, nein, nein! 
Doch!') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT '-- splitByChar'; +SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT '-- splitByString'; + +SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; +SELECT 
splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 
'python'; +SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT '-- splitByRegexp'; + +SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT 
splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'spark'; + +SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = ''; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = ''; + +SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'python'; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'python'; + +SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'spark'; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'spark'; diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference b/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference deleted file mode 100644 index 0c73fd7de76..00000000000 --- a/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference +++ /dev/null @@ -1,22 +0,0 @@ -splitByAlpha -['ab','cd'] -['ab','cd','ef.gh'] -splitByNonAlpha 
-['128','0'] -['128','0','0.1'] -splitByWhitespace -['Nein,','nein,'] -['Nein,','nein,','nein! Doch!'] -splitByChar -['a','b'] -['a','b','c=d'] -splitByString -['a','='] -['a','=','=b==c==d'] -['a','b'] -['a','b','c==d'] -splitByRegexp -['a','1'] -['a','1','2bc23de345f'] -['a','bc'] -['a','bc','de345f'] diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql b/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql deleted file mode 100644 index c550f69bd0c..00000000000 --- a/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql +++ /dev/null @@ -1,27 +0,0 @@ -SELECT 'splitByAlpha'; -SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 0; -SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 1; - -SELECT 'splitByNonAlpha'; -SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 0; -SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 1; - -SELECT 'splitByWhitespace'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) settings split_tokens_like_python = 0; -SELECT splitByWhitespace('Nein, nein, nein! 
Doch!', 2) settings split_tokens_like_python = 1; - -SELECT 'splitByChar'; -SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 0; -SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 1; - -SELECT 'splitByString'; -SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; -SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; -SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; -SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; - -SELECT 'splitByRegexp'; -SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; -SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1; From cf12563df18bad5cf934b851fafb8e3ae07e9bcd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 13 Sep 2023 12:55:17 +0000 Subject: [PATCH 4/7] Fix style check --- src/Functions/FunctionsStringArray.cpp | 3 +-- src/Functions/FunctionsStringArray.h | 2 -- src/Functions/URL/URLHierarchy.cpp | 4 ---- src/Functions/URL/URLPathHierarchy.cpp | 4 ---- src/Functions/URL/extractURLParameterNames.cpp | 4 ---- src/Functions/URL/extractURLParameters.cpp | 4 ---- 6 files changed, 1 insertion(+), 20 deletions(-) diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 085cb2c8eae..326651c111d 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -5,8 +5,7 @@ namespace DB { namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_COLUMN; } template diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 
8b732292a1c..e720fc96e52 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -23,10 +23,8 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index 5bd5629992f..260053dc401 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -3,10 +3,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} class URLPathHierarchyImpl { diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index 714f56ece36..a11be358a70 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -3,10 +3,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} class URLHierarchyImpl { diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 3d40013335a..2b79be07cae 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -3,10 +3,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} class ExtractURLParameterNamesImpl { diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index 82df7888196..271e5dc89c9 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -3,10 +3,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} class ExtractURLParametersImpl { From 40e272521b2cf3cbad3c6a303d2a08a3a1126cc6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 13 Sep 2023 12:55:51 +0000 Subject: [PATCH 5/7] Fix spell check --- 
utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index bcb971951e5..b2818a50a57 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2215,6 +2215,7 @@ sparkBar sparkbar sparsehash speedscope +splitby splitByChar splitByNonAlpha splitByRegexp From b583b80733b7409aa141742a3867291b514a64ca Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 14 Sep 2023 10:23:40 +0000 Subject: [PATCH 6/7] Fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index b2818a50a57..b3084e8f298 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2215,6 +2215,7 @@ sparkBar sparkbar sparsehash speedscope +splitBy splitby splitByChar splitByNonAlpha From 774c4b52dadbd0fbb2430d2abbf62c3b630204ef Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 18 Sep 2023 20:08:37 +0000 Subject: [PATCH 7/7] Rework --- docs/en/operations/settings/settings.md | 11 +- .../functions/splitting-merging-functions.md | 16 +- src/Core/Settings.h | 2 +- src/Functions/FunctionsStringArray.cpp | 22 +- src/Functions/FunctionsStringArray.h | 339 +++++------------- src/Functions/URL/URLHierarchy.cpp | 2 +- src/Functions/URL/URLPathHierarchy.cpp | 2 +- .../URL/extractURLParameterNames.cpp | 2 +- src/Functions/URL/extractURLParameters.cpp | 2 +- .../02475_split_with_max_substrings.reference | 204 ++++++++--- .../02475_split_with_max_substrings.sql | 226 +++++++++--- ...6_splitby_max_substring_behavior.reference | 126 ------- .../02876_splitby_max_substring_behavior.sql | 151 -------- 13 files changed, 446 insertions(+), 659 deletions(-) delete mode 100644 
tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference delete mode 100644 tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index ad1437ea3eb..ef4703e3bc3 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4067,17 +4067,16 @@ Result: └─────┴─────┴───────┘ ``` -## splitby_max_substring_behavior {#splitby-max-substring-behavior} +## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string} -Controls how functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with given `max_substring` argument behave. +Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. Possible values: -- `''` - If `max_substring` >=1, return the first `max_substring`-many splits. -- `'python'` - If `max_substring` >= 0, split `max_substring`-many times, and return `max_substring + 1` elements where the last element contains the remaining string. -- `'spark'` - If `max_substring` >= 1, split `max_substring`-many times, and return `max_substring + 1` elements where the last element contains the remaining string. +- `0` - The remaining string will not be included in the last element of the result array. +- `1` - The remaining string will be included in the last element of the result array. This is the behavior of Spark's [`split()`](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html) function and Python's ['string.split()'](https://docs.python.org/3/library/stdtypes.html#str.split) method. -Default value: ``. 
+Default value: `0` ## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 1e0bc3da664..614bf556c8e 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -21,7 +21,7 @@ splitByChar(separator, s[, max_substrings])) - `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). -- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. +- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, the returned array will contain at most `max_substrings` substrings, otherwise the function will return as many substrings as possible. **Returned value(s)** @@ -39,7 +39,9 @@ For example, - in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']` - in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']` -The previous behavior can be restored by setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) = 'python'. 
+A behavior similar to ClickHouse pre-v22.11 can be achieved by setting +[splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) = 1, for example: +`SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1 -- ['a', 'b=c=d']` ::: **Example** @@ -82,7 +84,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. -Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. +Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. **Example** @@ -137,7 +139,7 @@ Returns an array of selected substrings. Empty substrings may be selected when: Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. +Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. **Example** @@ -188,7 +190,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
-Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. +Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. **Example** @@ -227,7 +229,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. +Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. **Example** @@ -289,7 +291,7 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Setting [splitby_max_substring_behavior](../../operations/settings/settings.md#splitby-max-substring-behavior) (default: '') controls the behavior with `max_substrings` > 0. +Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. 
**Example** diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ca8f82ed8b6..fe9f50baf20 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -502,7 +502,7 @@ class IColumn; M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ - M(String, splitby_max_substring_behavior, "", "Control the behavior of the 'max_substring' argument in functions splitBy*(): '' (default), 'python' or 'spark'", 0) \ + M(Bool, splitby_max_substrings_includes_remaining_string, false, "Functions 'splitBy*()' with 'max_substrings' argument > 0 include the remaining string as last element in the result", 0) \ \ M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 326651c111d..4afee55704f 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -19,7 +19,7 @@ std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument return static_cast(value); } -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position, MaxSubstringBehavior max_substring_behavior) +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) { if 
(max_substrings_argument_position >= arguments.size()) return std::nullopt; @@ -35,24 +35,8 @@ std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, arguments[max_substrings_argument_position].column->getName(), max_substrings_argument_position + 1); - if (max_splits) - switch (max_substring_behavior) - { - case MaxSubstringBehavior::LikeClickHouse: - case MaxSubstringBehavior::LikeSpark: - { - if (*max_splits <= 0) - return std::nullopt; - break; - } - case MaxSubstringBehavior::LikePython: - { - if (*max_splits < 0) - return std::nullopt; - break; - } - } - + if (*max_splits <= 0) + return std::nullopt; return max_splits; } diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index e720fc96e52..d7d7e3b5100 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -54,14 +54,7 @@ namespace ErrorCodes using Pos = const char *; -enum class MaxSubstringBehavior -{ - LikeClickHouse, - LikeSpark, - LikePython -}; - -std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position, MaxSubstringBehavior max_substring_behavior); +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position); /// Substring generators. All of them have a common interface. 
@@ -72,7 +65,7 @@ private: Pos end; std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = "alphaTokens"; @@ -97,10 +90,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 1, max_substring_behavior); + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); } /// Called for each next string. @@ -125,35 +118,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = end; - return true; - } - break; + token_end = end; + pos = end; + return true; } } + else + if (splits == *max_splits) + return false; } while (pos < end && isAlphaASCII(*pos)) @@ -173,7 +149,7 @@ private: Pos end; std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: /// Get the name of the function. 
@@ -190,10 +166,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 1, max_substring_behavior); + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); } /// Called for each next string. @@ -218,35 +194,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = end; - return true; - } - break; + token_end = end; + pos = end; + return true; } } + else + if (splits == *max_splits) + return false; } while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) @@ -266,7 +225,7 @@ private: Pos end; std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = "splitByWhitespace"; @@ -282,10 +241,10 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 1, max_substring_behavior); + 
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 1); } /// Called for each next string. @@ -310,35 +269,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = end; - return true; - } - break; + token_end = end; + pos = end; + return true; } } + else + if (splits == *max_splits) + return false; } while (pos < end && !isWhitespaceASCII(*pos)) @@ -359,7 +301,7 @@ private: char separator; std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = "splitByChar"; @@ -383,7 +325,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -398,8 +340,8 @@ public: separator = sep_str[0]; - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); } void set(Pos pos_, Pos end_) @@ -418,35 +360,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case 
MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = nullptr; - return true; - } - break; + token_end = end; + pos = nullptr; + return true; } } + else + if (splits == *max_splits) + return false; } pos = reinterpret_cast(memchr(pos, separator, end - pos)); @@ -472,7 +397,7 @@ private: String separator; std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = "splitByString"; @@ -487,7 +412,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -497,8 +422,8 @@ public: separator = col->getValue(); - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. 
@@ -521,35 +446,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = end; - return true; - } - break; + token_end = end; + pos = end; + return true; } } + else + if (splits == *max_splits) + return false; } pos += 1; @@ -565,35 +473,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = nullptr; - return true; - } - break; + token_end = end; + pos = nullptr; + return true; } } + else + if (splits == *max_splits) + return false; } pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); @@ -622,7 +513,7 @@ private: std::optional max_splits; size_t splits; - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = "splitByRegexp"; @@ -638,7 +529,7 @@ public: static constexpr auto strings_argument_position = 1uz; - void init(const ColumnsWithTypeAndName & arguments, MaxSubstringBehavior max_substring_behavior_) + void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { const ColumnConst * col = 
checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -649,8 +540,8 @@ public: if (!col->getValue().empty()) re = std::make_shared(Regexps::createRegexp(col->getValue())); - max_substring_behavior = max_substring_behavior_; - max_splits = extractMaxSplits(arguments, 2, max_substring_behavior); + max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. @@ -673,35 +564,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = end; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = end; - return true; - } - break; + token_end = end; + pos = end; + return true; } } + else + if (splits == *max_splits) + return false; } pos += 1; @@ -717,35 +591,18 @@ public: if (max_splits) { - switch (max_substring_behavior) + if (max_substrings_includes_remaining_string) { - case MaxSubstringBehavior::LikeClickHouse: + if (splits == *max_splits - 1) { - if (splits == *max_splits) - return false; - break; - } - case MaxSubstringBehavior::LikeSpark: - { - if (splits == *max_splits - 1) - { - token_end = end; - pos = nullptr; - return true; - } - break; - } - case MaxSubstringBehavior::LikePython: - { - if (splits == *max_splits) - { - token_end = end; - pos = nullptr; - return true; - } - break; + token_end = end; + pos = nullptr; + return true; } } + else + if (splits == *max_splits) + return false; } if (!re->match(pos, end - pos, matches) || !matches[0].length) @@ -792,7 +649,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const 
ColumnsWithTypeAndName & arguments, MaxSubstringBehavior /*max_substring_behavior*/) + void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -845,7 +702,7 @@ template class FunctionTokens : public IFunction { private: - MaxSubstringBehavior max_substring_behavior; + bool max_substrings_includes_remaining_string; public: static constexpr auto name = Generator::name; @@ -854,17 +711,7 @@ public: explicit FunctionTokens(ContextPtr context) { const Settings & settings = context->getSettingsRef(); - if (settings.splitby_max_substring_behavior.value == "") - max_substring_behavior = MaxSubstringBehavior::LikeClickHouse; - else if (settings.splitby_max_substring_behavior.value == "python") - max_substring_behavior = MaxSubstringBehavior::LikePython; - else if (settings.splitby_max_substring_behavior.value == "spark") - max_substring_behavior = MaxSubstringBehavior::LikeSpark; - else - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal value {} for setting splitby_max_substring_behavior in function {}, must be '', 'python' or 'spark'", - settings.splitby_max_substring_behavior.value, getName()); + max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string; } String getName() const override { return name; } @@ -885,7 +732,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { Generator generator; - generator.init(arguments, max_substring_behavior); + generator.init(arguments, max_substrings_includes_remaining_string); const auto & array_argument = arguments[generator.strings_argument_position]; diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index 260053dc401..96b64d3182b 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ 
b/src/Functions/URL/URLHierarchy.cpp @@ -30,7 +30,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index a11be358a70..7fd6601d780 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -29,7 +29,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 2b79be07cae..b792d9140d6 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -29,7 +29,7 @@ public: static constexpr auto strings_argument_position = 0uz; - void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {} /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index 271e5dc89c9..e1243d8fbcd 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -27,7 +27,7 @@ public: validateFunctionArgumentTypes(func, arguments, mandatory_args); } - void init(const ColumnsWithTypeAndName & /*arguments*/, MaxSubstringBehavior /*max_substring_behavior*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {} static constexpr auto strings_argument_position = 0uz; diff --git a/tests/queries/0_stateless/02475_split_with_max_substrings.reference b/tests/queries/0_stateless/02475_split_with_max_substrings.reference index d55ef45a5e0..904441f83fa 100644 --- a/tests/queries/0_stateless/02475_split_with_max_substrings.reference +++ b/tests/queries/0_stateless/02475_split_with_max_substrings.reference @@ -1,44 +1,160 @@ -['1','2','3'] -['1','2','3'] -['1','2','3'] -['1'] -['1','2'] -['1','2','3'] -['1','2','3'] -['one','two','three',''] -['one','two','three',''] -['one','two','three',''] -['one'] -['one','two'] -['one','two','three'] -['one','two','three',''] -['one','two','three',''] -['abca','abc'] -['abca','abc'] -['abca','abc'] -['abca'] -['abca','abc'] -['abca','abc'] -['abca','abc'] -['1','a','b'] -['1','a','b'] -['1','a','b'] -['1'] -['1','a'] -['1','a','b'] -['1','a','b'] -['1!','a,','b.'] -['1!','a,','b.'] -['1!','a,','b.'] -['1!'] -['1!','a,'] -['1!','a,','b.'] -['1!','a,','b.'] -['1','2 3','4,5','abcde'] -['1','2 3','4,5','abcde'] -['1','2 3','4,5','abcde'] -['1'] -['1','2 3'] -['1','2 3','4,5'] -['1','2 3','4,5','abcde'] -['1','2 3','4,5','abcde'] +-- negative tests +-- splitByChar +-- (default) +['a','','b','c','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a'] +['a',''] +['a','','b'] +['a','','b','c'] +['a','','b','c','d'] +['a','','b','c','d'] +-- (include remainder) +['a','','b','c','d'] 
+['a','','b','c','d'] +['a','','b','c','d'] +['a==b=c=d'] +['a','=b=c=d'] +['a','','b=c=d'] +['a','','b','c=d'] +['a','','b','c','d'] +['a','','b','c','d'] +-- splitByString +-- (default) +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a'] +['a','='] +['a','=','='] +['a','=','=','b'] +['a','=','=','b','='] +['a','=','=','b','=','c'] +['a','=','=','b','=','c','='] +['a','=','=','b','=','c','='] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a'] +['a',''] +['a','','b'] +['a','','b','c'] +['a','','b','c','d'] +['a','','b','c','d'] +-- (include remainder) +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a==b=c=d'] +['a','==b=c=d'] +['a','=','=b=c=d'] +['a','=','=','b=c=d'] +['a','=','=','b','=c=d'] +['a','=','=','b','=','c=d'] +['a','=','=','b','=','c','=d'] +['a','=','=','b','=','c','=','d'] +['a','=','=','b','=','c','=','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a','','b','c','d'] +['a==b=c=d'] +['a','=b=c=d'] +['a','','b=c=d'] +['a','','b','c=d'] +['a','','b','c','d'] +['a','','b','c','d'] +-- splitByRegexp +-- (default) +['a','bc','de','f'] +['a','bc','de','f'] +['a','bc','de','f'] +['a'] +['a','bc'] +['a','bc','de'] +['a','bc','de','f'] +['a','bc','de','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a'] +['a','1'] +['a','1','2'] +['a','1','2','b'] +['a','1','2','b','c'] +-- (include remainder) +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a','1','2','b','c','2','3','d','e','3','4','5','f'] +['a12bc23de345f'] +['a','12bc23de345f'] +['a','1','2bc23de345f'] +['a','1','2','bc23de345f'] +['a','1','2','b','c23de345f'] +['a','bc','de','f'] +['a','bc','de','f'] 
+['a','bc','de','f'] +['a12bc23de345f'] +['a','bc23de345f'] +['a','bc','de345f'] +['a','bc','de','f'] +['a','bc','de','f'] +-- splitByAlpha +-- (default) +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab'] +['ab','cd'] +['ab','cd','ef'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +-- (include remainder) +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +['ab.cd.ef.gh'] +['ab','cd.ef.gh'] +['ab','cd','ef.gh'] +['ab','cd','ef','gh'] +['ab','cd','ef','gh'] +-- splitByNonAlpha +-- (default) +['128','0','0','1'] +['128','0','0','1'] +['128','0','0','1'] +['128'] +['128','0'] +['128','0','0'] +['128','0','0','1'] +['128','0','0','1'] +-- (include remainder) +['128','0','0','1'] +['128','0','0','1'] +['128','0','0','1'] +['128.0.0.1'] +['128','0.0.1'] +['128','0','0.1'] +['128','0','0','1'] +['128','0','0','1'] +-- splitByWhitespace +-- (default) +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,'] +['Nein,','nein,'] +['Nein,','nein,','nein!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +-- (include remainder) +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein, nein, nein! Doch!'] +['Nein,','nein, nein! Doch!'] +['Nein,','nein,','nein! 
Doch!'] +['Nein,','nein,','nein!','Doch!'] +['Nein,','nein,','nein!','Doch!'] diff --git a/tests/queries/0_stateless/02475_split_with_max_substrings.sql b/tests/queries/0_stateless/02475_split_with_max_substrings.sql index c51133c604e..3f367c75433 100644 --- a/tests/queries/0_stateless/02475_split_with_max_substrings.sql +++ b/tests/queries/0_stateless/02475_split_with_max_substrings.sql @@ -1,59 +1,175 @@ -select splitByChar(',', '1,2,3'); -select splitByChar(',', '1,2,3', -1); -select splitByChar(',', '1,2,3', 0); -select splitByChar(',', '1,2,3', 1); -select splitByChar(',', '1,2,3', 2); -select splitByChar(',', '1,2,3', 3); -select splitByChar(',', '1,2,3', 4); - -select splitByRegexp('[ABC]', 'oneAtwoBthreeC'); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5); - -SELECT alphaTokens('abca1abc'); -SELECT alphaTokens('abca1abc', -1); -SELECT alphaTokens('abca1abc', 0); -SELECT alphaTokens('abca1abc', 1); -SELECT alphaTokens('abca1abc', 2); -SELECT alphaTokens('abca1abc', 3); - -SELECT splitByAlpha('abca1abc'); - -SELECT splitByNonAlpha(' 1! a, b. '); -SELECT splitByNonAlpha(' 1! a, b. ', -1); -SELECT splitByNonAlpha(' 1! a, b. ', 0); -SELECT splitByNonAlpha(' 1! a, b. ', 1); -SELECT splitByNonAlpha(' 1! a, b. ', 2); -SELECT splitByNonAlpha(' 1! a, b. ', 3); -SELECT splitByNonAlpha(' 1! a, b. ', 4); - -SELECT splitByWhitespace(' 1! a, b. '); -SELECT splitByWhitespace(' 1! a, b. ', -1); -SELECT splitByWhitespace(' 1! a, b. ', 0); -SELECT splitByWhitespace(' 1! a, b. ', 1); -SELECT splitByWhitespace(' 1! a, b. ', 2); -SELECT splitByWhitespace(' 1! a, b. ', 3); -SELECT splitByWhitespace(' 1! a, b. 
', 4); - -SELECT splitByString(', ', '1, 2 3, 4,5, abcde'); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', -1); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 0); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 1); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 2); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 3); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 4); -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 5); - - -select splitByChar(',', '1,2,3', ''); -- { serverError 43 } -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 } +SELECT '-- negative tests'; +SELECT splitByChar(',', '1,2,3', ''); -- { serverError 43 } +SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 } SELECT alphaTokens('abca1abc', ''); -- { serverError 43 } SELECT splitByAlpha('abca1abc', ''); -- { serverError 43 } SELECT splitByNonAlpha(' 1! a, b. ', ''); -- { serverError 43 } SELECT splitByWhitespace(' 1! a, b. ', ''); -- { serverError 43 } -SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 } \ No newline at end of file +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 } + +SELECT '-- splitByChar'; +SELECT '-- (default)'; +SELECT splitByChar('=', 'a==b=c=d'); +SELECT splitByChar('=', 'a==b=c=d', -1); +SELECT splitByChar('=', 'a==b=c=d', 0); +SELECT splitByChar('=', 'a==b=c=d', 1); +SELECT splitByChar('=', 'a==b=c=d', 2); +SELECT splitByChar('=', 'a==b=c=d', 3); +SELECT splitByChar('=', 'a==b=c=d', 4); +SELECT splitByChar('=', 'a==b=c=d', 5); +SELECT splitByChar('=', 'a==b=c=d', 6); +SELECT '-- (include remainder)'; +SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS 
splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByChar('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1; + +SELECT '-- splitByString'; +SELECT '-- (default)'; +SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 8) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('', 'a==b=c=d', 9) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 
'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT splitByString('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 0; +SELECT '-- (include remainder)'; +SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT 
splitByString('', 'a==b=c=d', 8) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('', 'a==b=c=d', 9) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByString('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1; + + +SELECT '-- splitByRegexp'; +SELECT '-- (default)'; +SELECT splitByRegexp('\\d+', 'a12bc23de345f'); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 3); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 4); +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 5); +SELECT splitByRegexp('', 'a12bc23de345f'); +SELECT splitByRegexp('', 'a12bc23de345f', -1); +SELECT splitByRegexp('', 'a12bc23de345f', 0); +SELECT splitByRegexp('', 'a12bc23de345f', 1); +SELECT splitByRegexp('', 'a12bc23de345f', 2); +SELECT splitByRegexp('', 'a12bc23de345f', 3); +SELECT splitByRegexp('', 'a12bc23de345f', 4); +SELECT splitByRegexp('', 'a12bc23de345f', 5); +SELECT '-- (include 
remainder)'; +SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('', 'a12bc23de345f', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; + +SELECT '-- splitByAlpha'; +SELECT '-- (default)'; +SELECT splitByAlpha('ab.cd.ef.gh'); +SELECT splitByAlpha('ab.cd.ef.gh', -1); +SELECT splitByAlpha('ab.cd.ef.gh', 0); +SELECT splitByAlpha('ab.cd.ef.gh', 1); 
+SELECT splitByAlpha('ab.cd.ef.gh', 2); +SELECT splitByAlpha('ab.cd.ef.gh', 3); +SELECT splitByAlpha('ab.cd.ef.gh', 4); +SELECT splitByAlpha('ab.cd.ef.gh', 5); +SELECT '-- (include remainder)'; +SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByAlpha('ab.cd.ef.gh', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; + +SELECT '-- splitByNonAlpha'; +SELECT '-- (default)'; +SELECT splitByNonAlpha('128.0.0.1'); +SELECT splitByNonAlpha('128.0.0.1', -1); +SELECT splitByNonAlpha('128.0.0.1', 0); +SELECT splitByNonAlpha('128.0.0.1', 1); +SELECT splitByNonAlpha('128.0.0.1', 2); +SELECT splitByNonAlpha('128.0.0.1', 3); +SELECT splitByNonAlpha('128.0.0.1', 4); +SELECT splitByNonAlpha('128.0.0.1', 5); +SELECT '-- (include remainder)'; +SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 3) SETTINGS 
splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByNonAlpha('128.0.0.1', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +-- +-- +SELECT '-- splitByWhitespace'; +SELECT '-- (default)'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!'); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 3); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 4); +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 5); +SELECT '-- (include remainder)'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1; +SELECT splitByWhitespace('Nein, nein, nein! 
Doch!', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1; diff --git a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference deleted file mode 100644 index 9966c7d090e..00000000000 --- a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.reference +++ /dev/null @@ -1,126 +0,0 @@ --- splitByAlpha -['ab','cd','ef','gh'] -['ab','cd','ef','gh'] -['ab','cd','ef','gh'] -['ab'] -['ab','cd'] -['ab','cd','ef','gh'] -['ab','cd','ef','gh'] -['ab.cd.ef.gh'] -['ab','cd.ef.gh'] -['ab','cd','ef.gh'] -['ab','cd','ef','gh'] -['ab','cd','ef','gh'] -['ab','cd','ef','gh'] -['ab.cd.ef.gh'] -['ab','cd.ef.gh'] --- splitByNonAlpha -['128','0','0','1'] -['128','0','0','1'] -['128','0','0','1'] -['128'] -['128','0'] -['128','0','0','1'] -['128','0','0','1'] -['128.0.0.1'] -['128','0.0.1'] -['128','0','0.1'] -['128','0','0','1'] -['128','0','0','1'] -['128','0','0','1'] -['128.0.0.1'] -['128','0.0.1'] --- splitByWhitespace -['Nein,','nein,','nein!','Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein,'] -['Nein,','nein,'] -['Nein,','nein,','nein!','Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein, nein, nein! Doch!'] -['Nein,','nein, nein! Doch!'] -['Nein,','nein,','nein! Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein,','nein,','nein!','Doch!'] -['Nein, nein, nein! Doch!'] -['Nein,','nein, nein! 
Doch!'] --- splitByChar -['a','','b','c','d'] -['a','','b','c','d'] -['a','','b','c','d'] -['a'] -['a',''] -['a','','b','c','d'] -['a','','b','c','d'] -['a==b=c=d'] -['a','=b=c=d'] -['a','','b=c=d'] -['a','','b','c','d'] -['a','','b','c','d'] -['a','','b','c','d'] -['a==b=c=d'] -['a','=b=c=d'] --- splitByString -['a','b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a'] -['a','b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a==b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a','b=c=d'] -['a==b=c=d'] -['a','b=c=d'] -['a','=','=','b','=','c','=','d'] -['a','=','=','b','=','c','=','d'] -['a','=','=','b','=','c','=','d'] -['a'] -['a','='] -['a','=','=','b','=','c','=','d'] -['a','=','=','b','=','c','=','d'] -['a==b=c=d'] -['a','==b=c=d'] -['a','=','=b=c=d'] -['a','=','=','b','=','c','=','d'] -['a','=','=','b','=','c','=','d'] -['a','=','=','b','=','c','=','d'] -['a==b=c=d'] -['a','==b=c=d'] --- splitByRegexp -['a','bc','de','f'] -['a','bc','de','f'] -['a','bc','de','f'] -['a'] -['a','bc'] -['a','bc','de','f'] -['a','bc','de','f'] -['a12bc23de345f'] -['a','bc23de345f'] -['a','bc','de345f'] -['a','bc','de','f'] -['a','bc','de','f'] -['a','bc','de','f'] -['a12bc23de345f'] -['a','bc23de345f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a'] -['a','1'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a12bc23de345f'] -['a','12bc23de345f'] -['a','1','2bc23de345f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a','1','2','b','c','2','3','d','e','3','4','5','f'] -['a12bc23de345f'] -['a','12bc23de345f'] diff --git a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql b/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql deleted file mode 100644 index 1dcad65f09b..00000000000 --- 
a/tests/queries/0_stateless/02876_splitby_max_substring_behavior.sql +++ /dev/null @@ -1,151 +0,0 @@ -SELECT '-- splitByAlpha'; -SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT '-- splitByNonAlpha'; -SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substring_behavior = 
'python'; -SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT '-- splitByWhitespace'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByWhitespace('Nein, nein, nein! 
Doch!') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT '-- splitByChar'; -SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT '-- splitByString'; - -SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; -SELECT 
splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByString('==', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('==', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('==', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('==', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('==', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 
'python'; -SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT '-- splitByRegexp'; - -SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT 
splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'spark'; - -SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = ''; -SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = ''; - -SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'python'; -SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'python'; - -SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substring_behavior = 'spark'; -SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substring_behavior = 'spark';