diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e9e5920fa59..227483758d5 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4067,6 +4067,17 @@ Result: └─────┴─────┴───────┘ ``` +## split_tokens_like_python {#split-tokens-like-python} + +Controls whether functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with `max_substrings` argument > 0 include the remaining string (if any) in the result array (Python semantics) or not (Spark semantics). + +Possible values: + +- 0 - Don't include the remaining string (Spark semantics). +- 1 - Include the remaining string (Python semantics). + +Default value: `0`. + ## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions} Enables or disables returning results of type: diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index c88643ef7cf..7e788a8e45b 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -38,6 +38,8 @@ The behavior of parameter `max_substrings` changed starting with ClickHouse v22. For example, - in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']` - in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']` + +The previous behavior can be restored by setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) = 1. ::: **Example** @@ -80,6 +82,8 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. 
+Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether, when `max_substrings` > 0, the remaining string (if any) is included in the result array. + **Example** ``` sql @@ -133,6 +137,8 @@ Returns an array of selected substrings. Empty substrings may be selected when: Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether, when `max_substrings` > 0, the remaining string (if any) is included in the result array. + **Example** ``` sql @@ -182,6 +188,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether, when `max_substrings` > 0, the remaining string (if any) is included in the result array. + **Example** ``` sql @@ -219,6 +227,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether, when `max_substrings` > 0, the remaining string (if any) is included in the result array. + **Example** ``` sql @@ -279,6 +289,8 @@ Returns an array of selected substrings. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether, when `max_substrings` > 0, the remaining string (if any) is included in the result array. 
+ **Example** ``` sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e8430e96115..14e99918983 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -502,6 +502,7 @@ class IColumn; M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ + M(Bool, split_tokens_like_python, false, "If true, then functions splitBy*() with given max_substring argument include remaining string in the result (Python semantics) or not (Spark semantics).", 0) \ \ M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \ M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \ diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 4d118481bb2..51b50d793e9 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -9,6 +9,40 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +template +std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument) +{ + const auto * col = checkAndGetColumnConst>(argument.column.get()); + if (!col) + return std::nullopt; + + auto value = col->template getValue(); + return static_cast(value); +} + +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position) +{ + if (max_substrings_argument_position >= arguments.size()) + return std::nullopt; + 
+ std::optional max_splits; + if (!((max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) + || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])))) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {}, which is {}-th argument",// of function {}", + arguments[max_substrings_argument_position].column->getName(), + max_substrings_argument_position + 1);//, + /// getName()); + + if (max_splits && *max_splits <= 0) + return std::nullopt; + + return max_splits; +} + DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const { FunctionArgumentDescriptors mandatory_args{ diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index ce78090dc6b..92eb015e6e3 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -56,6 +56,13 @@ namespace ErrorCodes using Pos = const char *; +std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position); + +enum class SplitTokenMode +{ + LikeSpark, + LikePython +}; /// Substring generators. All of them have a common interface. 
@@ -64,6 +71,9 @@ class SplitByAlphaImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "alphaTokens"; @@ -87,15 +97,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. @@ -110,10 +124,26 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + while (pos < end && isAlphaASCII(*pos)) ++pos; token_end = pos; + ++splits; return true; } @@ -124,6 +154,9 @@ class SplitByNonAlphaImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: /// Get the name of the function. @@ -139,15 +172,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -162,10 +199,25 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) ++pos; token_end = pos; + splits++; return true; } @@ -176,6 +228,9 @@ class SplitByWhitespaceImpl private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByWhitespace"; @@ -190,15 +245,19 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(1); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) + { + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 1); + } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -213,10 +272,26 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + while (pos < end && !isWhitespaceASCII(*pos)) ++pos; token_end = pos; + splits++; return true; } @@ -228,6 +303,9 @@ private: Pos pos; Pos end; char separator; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByChar"; @@ -250,9 +328,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -266,12 +343,16 @@ public: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. 
Must be exactly one byte.", getName()); separator = sep_str[0]; + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } void set(Pos pos_, Pos end_) { pos = pos_; end = end_; + splits = 0; } bool get(Pos & token_begin, Pos & token_end) @@ -280,12 +361,28 @@ public: return false; token_begin = pos; - pos = reinterpret_cast(memchr(pos, separator, end - pos)); + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + + pos = reinterpret_cast(memchr(pos, separator, end - pos)); if (pos) { token_end = pos; ++pos; + ++splits; } else token_end = end; @@ -300,8 +397,10 @@ class SplitByStringImpl private: Pos pos; Pos end; - String separator; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; public: static constexpr auto name = "splitByString"; @@ -315,9 +414,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -326,6 +424,9 @@ public: "Must be constant string.", arguments[0].column->getName(), getName()); separator = col->getValue(); + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. @@ -333,6 +434,7 @@ public: { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -344,8 +446,25 @@ public: return false; token_begin = pos; + + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + pos += 1; token_end = pos; + ++splits; } else { @@ -354,8 +473,22 @@ public: token_begin = pos; - pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size())); if (pos) { token_end = pos; @@ -363,6 +496,7 @@ public: } else token_end = end; + ++splits; } return true; @@ -378,6 +512,10 @@ private: Pos pos; Pos end; + std::optional max_splits; + size_t splits; + SplitTokenMode split_token_mode; + public: static constexpr auto name = "splitByRegexp"; static String getName() { return name; } @@ -391,9 +529,8 @@ public: } static constexpr auto strings_argument_position = 1uz; - static constexpr auto max_substrings_argument_position = std::make_optional(2); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); @@ -403,6 +540,9 @@ public: if (!col->getValue().empty()) re = std::make_shared(Regexps::createRegexp(col->getValue())); + + split_token_mode = split_token_mode_; + max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. @@ -410,6 +550,7 @@ public: { pos = pos_; end = end_; + splits = 0; } /// Get the next token, if any, or return false. 
@@ -421,8 +562,25 @@ public: return false; token_begin = pos; + + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = end; + return true; + } + } + } + pos += 1; token_end = pos; + ++splits; } else { @@ -431,6 +589,21 @@ public: token_begin = pos; + if (max_splits && splits >= max_splits) + { + switch (split_token_mode) + { + case SplitTokenMode::LikeSpark: + return false; + case SplitTokenMode::LikePython: + { + token_end = end; + pos = nullptr; + return true; + } + } + } + if (!re->match(pos, end - pos, matches) || !matches[0].length) { token_end = end; @@ -441,6 +614,7 @@ public: token_end = pos + matches[0].offset; pos = token_end + matches[0].length; } + ++splits; } return true; @@ -473,9 +647,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & arguments) + void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode /*split_token_mode*/) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -527,9 +700,18 @@ public: template class FunctionTokens : public IFunction { +private: + SplitTokenMode split_token_mode; + public: static constexpr auto name = Generator::name; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionTokens(ContextPtr context) + { + const Settings & settings = context->getSettingsRef(); + split_token_mode = settings.split_tokens_like_python ? 
SplitTokenMode::LikePython : SplitTokenMode::LikeSpark; + } String getName() const override { return name; } @@ -549,14 +731,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { Generator generator; - generator.init(arguments); + generator.init(arguments, split_token_mode); const auto & array_argument = arguments[generator.strings_argument_position]; - /// Whether we need to limit max tokens returned by Generator::get - /// If max_substrings is std::nullopt, no limit is applied. - auto max_substrings = getMaxSubstrings(arguments); - const ColumnString * col_str = checkAndGetColumn(array_argument.column.get()); const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get()); @@ -592,7 +770,7 @@ public: generator.set(pos, end); size_t j = 0; - while (generator.get(token_begin, token_end) && !(max_substrings && j >= *max_substrings)) + while (generator.get(token_begin, token_end)) { size_t token_size = token_end - token_begin; @@ -620,7 +798,7 @@ public: Pos token_begin = nullptr; Pos token_end = nullptr; - while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings)) + while (generator.get(token_begin, token_end)) dst.push_back(String(token_begin, token_end - token_begin)); return result_type->createColumnConst(col_str_const->size(), dst); @@ -629,47 +807,6 @@ public: throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}", array_argument.column->getName(), array_argument.column->getName(), getName()); } - -private: - template - std::optional getMaxSubstringsImpl(const ColumnWithTypeAndName & argument) const - { - const auto * col = checkAndGetColumnConst>(argument.column.get()); - if (!col) - return {}; - - auto value = col->template getValue(); - return static_cast(value); - } - - std::optional getMaxSubstrings(const ColumnsWithTypeAndName & 
arguments) const - { - const auto pos = Generator::max_substrings_argument_position; - if (!pos) - return std::nullopt; - - if (*pos >= arguments.size()) - return std::nullopt; - - std::optional max_substrings; - if (!((max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) - || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])))) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {}, which is {}-th argument of function {}", - arguments[*pos].column->getName(), - *pos + 1, - getName()); - - /// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to - /// no max_substrings argument in function - if (max_substrings && *max_substrings <= 0) - return std::nullopt; - - return max_substrings; - } }; diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp index 69819d2214f..bce876f735f 100644 --- a/src/Functions/URL/URLHierarchy.cpp +++ b/src/Functions/URL/URLHierarchy.cpp @@ -33,9 +33,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp index 2c4f4e9be5c..b44144a5358 100644 --- a/src/Functions/URL/URLPathHierarchy.cpp +++ b/src/Functions/URL/URLPathHierarchy.cpp @@ -32,9 +32,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp index 0e9153acf7f..785ed050d15 100644 --- a/src/Functions/URL/extractURLParameterNames.cpp +++ b/src/Functions/URL/extractURLParameterNames.cpp @@ -32,9 +32,8 @@ public: } static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} /// Called for each next string. void set(Pos pos_, Pos end_) diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp index 273edde8d18..c21ced2a3aa 100644 --- a/src/Functions/URL/extractURLParameters.cpp +++ b/src/Functions/URL/extractURLParameters.cpp @@ -31,10 +31,9 @@ public: validateFunctionArgumentTypes(func, arguments, mandatory_args); } - void init(const ColumnsWithTypeAndName & /*arguments*/) {} + void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {} static constexpr auto strings_argument_position = 0uz; - static constexpr auto max_substrings_argument_position = std::make_optional(); /// Called for each next string. 
void set(Pos pos_, Pos end_) diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference b/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference new file mode 100644 index 00000000000..0c73fd7de76 --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_spark_vs_python.reference @@ -0,0 +1,22 @@ +splitByAlpha +['ab','cd'] +['ab','cd','ef.gh'] +splitByNonAlpha +['128','0'] +['128','0','0.1'] +splitByWhitespace +['Nein,','nein,'] +['Nein,','nein,','nein! Doch!'] +splitByChar +['a','b'] +['a','b','c=d'] +splitByString +['a','='] +['a','=','=b==c==d'] +['a','b'] +['a','b','c==d'] +splitByRegexp +['a','1'] +['a','1','2bc23de345f'] +['a','bc'] +['a','bc','de345f'] diff --git a/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql b/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql new file mode 100644 index 00000000000..c550f69bd0c --- /dev/null +++ b/tests/queries/0_stateless/02876_splitby_spark_vs_python.sql @@ -0,0 +1,27 @@ +SELECT 'splitByAlpha'; +SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 0; +SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByNonAlpha'; +SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 0; +SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByWhitespace'; +SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) settings split_tokens_like_python = 0; +SELECT splitByWhitespace('Nein, nein, nein! 
Doch!', 2) settings split_tokens_like_python = 1; + +SELECT 'splitByChar'; +SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 1; + +SELECT 'splitByString'; +SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; +SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1; + +SELECT 'splitByRegexp'; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0; +SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1;