diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 1bf5839bf9f..7cad6b2fbbf 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -6,7 +6,7 @@ sidebar_label: Splitting and Merging Strings and Arrays # Functions for Splitting and Merging Strings and Arrays -## splitByChar(separator, s) +## splitByChar(separator, s[, max_substrings]) Splits a string into substrings separated by a specified character. It uses a constant string `separator` which consists of exactly one character. Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators. @@ -14,13 +14,14 @@ Returns an array of selected substrings. Empty substrings may be selected if the **Syntax** ``` sql -splitByChar(separator, s) +splitByChar(separator, s[, max_substrings]) ``` **Arguments** - `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. **Returned value(s)** @@ -44,20 +45,22 @@ SELECT splitByChar(',', '1,2,3,abcde'); └─────────────────────────────────┘ ``` -## splitByString(separator, s) +## splitByString(separator, s[, max_substrings]) Splits a string into substrings separated by a string. It uses a constant string `separator` of multiple characters as the separator. If the string `separator` is empty, it will split the string `s` into an array of single characters. 
**Syntax** ``` sql -splitByString(separator, s) +splitByString(separator, s[, max_substrings]) ``` **Arguments** - `separator` — The separator. [String](../../sql-reference/data-types/string.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. + **Returned value(s)** @@ -91,20 +94,22 @@ SELECT splitByString('', 'abcde'); └────────────────────────────┘ ``` -## splitByRegexp(regexp, s) +## splitByRegexp(regexp, s[, max_substrings]) Splits a string into substrings separated by a regular expression. It uses a regular expression string `regexp` as the separator. If the `regexp` is empty, it will split the string `s` into an array of single characters. If no match is found for this regular expression, the string `s` won't be split. **Syntax** ``` sql -splitByRegexp(regexp, s) +splitByRegexp(regexp, s[, max_substrings]) ``` **Arguments** - `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. + **Returned value(s)** @@ -146,7 +151,7 @@ Result: └────────────────────────────┘ ``` -## splitByWhitespace(s) +## splitByWhitespace(s[, max_substrings]) Splits a string into substrings separated by whitespace characters. Returns an array of selected substrings. @@ -154,12 +159,14 @@ Returns an array of selected substrings. **Syntax** ``` sql -splitByWhitespace(s) +splitByWhitespace(s[, max_substrings]) ``` **Arguments** - `s` — The string to split. 
[String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. + **Returned value(s)** @@ -179,7 +186,7 @@ SELECT splitByWhitespace(' 1! a, b. '); └─────────────────────────────────────┘ ``` -## splitByNonAlpha(s) +## splitByNonAlpha(s[, max_substrings]) Splits a string into substrings separated by whitespace and punctuation characters. Returns an array of selected substrings. @@ -187,12 +194,14 @@ Returns an array of selected substrings. **Syntax** ``` sql -splitByNonAlpha(s) +splitByNonAlpha(s[, max_substrings]) ``` **Arguments** - `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. + **Returned value(s)** @@ -217,10 +226,28 @@ SELECT splitByNonAlpha(' 1! a, b. '); Concatenates string representations of values listed in the array with the separator. `separator` is an optional parameter: a constant string, set to an empty string by default. Returns the string. -## alphaTokens(s) +## alphaTokens(s[, max_substrings]), splitByAlpha(s[, max_substrings]) Selects substrings of consecutive bytes from the ranges a-z and A-Z.Returns an array of substrings. +**Syntax** + +``` sql +alphaTokens(s[, max_substrings]) +splitByAlpha(s[, max_substrings]) +``` + +**Arguments** + +- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. + +**Returned value(s)** + +Returns an array of selected substrings. 
+ +Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). + **Example** ``` sql diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index fce87ed9cb3..06f58730e41 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -30,7 +30,9 @@ DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const DataTypes & argum REGISTER_FUNCTION(StringArray) { factory.registerFunction(); + factory.registerFunction(); + factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index f1805a76586..ef0d9ac2896 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -32,12 +32,12 @@ namespace ErrorCodes /** Functions that split strings into an array of strings or vice versa. * - * splitByChar(sep, s) - * splitByString(sep, s) - * splitByRegexp(regexp, s) + * splitByChar(sep, s[, max_substrings]) + * splitByString(sep, s[, max_substrings]) + * splitByRegexp(regexp, s[, max_substrings]) * - * splitByWhitespace(s) - split the string by whitespace characters - * splitByNonAlpha(s) - split the string by whitespace and punctuation characters + * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters + * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters * * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp. * - first subpattern, if regexp has subpattern; @@ -48,7 +48,7 @@ namespace ErrorCodes * arrayStringConcat(arr, delimiter) * - join an array of strings into one string via a separator. * - * alphaTokens(s) - select from the string subsequence `[a-zA-Z]+`. 
+ * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`. * * URL functions are located separately. */ @@ -622,14 +622,13 @@ public: Generator::checkArguments(arguments); const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition(); - if (max_substrings_pos) - if (arguments.size() > *max_substrings_pos && !isNativeInteger(arguments[*max_substrings_pos])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "{}-th argument for function '{}' must be integer, got '{}' instead", - *max_substrings_pos + 1, - getName(), - arguments[*max_substrings_pos]->getName()); + if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "{}-th argument for function '{}' must be integer, got '{}' instead", + *max_substrings_pos + 1, + getName(), + arguments[*max_substrings_pos]->getName()); return std::make_shared(std::make_shared()); } @@ -734,16 +733,16 @@ private: { const auto pos = Generator::getMaxSubstringsArgumentPosition(); if (!pos) - return {}; + return std::nullopt; - if (arguments.size() <= *pos) - return {}; + if (*pos >= arguments.size()) + return std::nullopt; std::optional max_substrings; - if (!((max_substrings = getMaxSubstringsImpl(arguments[2])) || (max_substrings = getMaxSubstringsImpl(arguments[2])) - || (max_substrings = getMaxSubstringsImpl(arguments[2])) || (max_substrings = getMaxSubstringsImpl(arguments[2])) - || (max_substrings = getMaxSubstringsImpl(arguments[2])) || (max_substrings = getMaxSubstringsImpl(arguments[2])) - || (max_substrings = getMaxSubstringsImpl(arguments[2])) || (max_substrings = getMaxSubstringsImpl(arguments[2])))) + if (!((max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) + || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) + || 
(max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) + || (max_substrings = getMaxSubstringsImpl(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl(arguments[*pos])))) throw Exception( ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}, which is {}-th argument of function {}", @@ -754,7 +753,7 @@ private: /// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to /// no max_substrings argument in function if (max_substrings && *max_substrings <= 0) - return {}; + return std::nullopt; return *max_substrings; } diff --git a/tests/queries/0_stateless/01866_split_by_regexp.reference b/tests/queries/0_stateless/01866_split_by_regexp.reference index ce33c1dd63a..a3ae2f35a5f 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.reference +++ b/tests/queries/0_stateless/01866_split_by_regexp.reference @@ -5,11 +5,3 @@ ['gbye','bug'] [''] [] -['one','two','three',''] -['one','two','three',''] -['one','two','three',''] -['one'] -['one','two'] -['one','two','three'] -['one','two','three',''] -['one','two','three',''] diff --git a/tests/queries/0_stateless/01866_split_by_regexp.sql b/tests/queries/0_stateless/01866_split_by_regexp.sql index 8553c962038..e472fb68d94 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.sql +++ b/tests/queries/0_stateless/01866_split_by_regexp.sql @@ -3,12 +3,3 @@ select splitByRegexp('', 'abcde'); select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['

hello

world

', 'gbyebug']) x); select splitByRegexp('ab', ''); select splitByRegexp('', ''); - -select splitByRegexp('[ABC]', 'oneAtwoBthreeC'); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4); -select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5); \ No newline at end of file diff --git a/tests/queries/0_stateless/02185_split_by_char.reference b/tests/queries/0_stateless/02185_split_by_char.reference index d8dd65ed63d..e831ed938fe 100644 --- a/tests/queries/0_stateless/02185_split_by_char.reference +++ b/tests/queries/0_stateless/02185_split_by_char.reference @@ -1,10 +1 @@ ['1','2','3'] -['1'] -['1','2'] -['1','2','3'] -['1','2','3'] -['1','2','3'] -['1','2','3'] -['1','2','3'] -['expr1'] -['expr2'] diff --git a/tests/queries/0_stateless/02185_split_by_char.sql b/tests/queries/0_stateless/02185_split_by_char.sql index 6ab1fc19079..127e2f631e5 100644 --- a/tests/queries/0_stateless/02185_split_by_char.sql +++ b/tests/queries/0_stateless/02185_split_by_char.sql @@ -1,12 +1 @@ select splitByChar(',', '1,2,3'); -select splitByChar(',', '1,2,3', 1); -select splitByChar(',', '1,2,3', 2); -select splitByChar(',', '1,2,3', 3); -select splitByChar(',', '1,2,3', 4); -select splitByChar(',', '1,2,3', 0); -select splitByChar(',', '1,2,3', -1); -select splitByChar(',', '1,2,3', -2); -SELECT splitByChar('=', s, 1) FROM values('s String', 'expr1=1+1=2', 'expr2=2+2=4=1+3'); - -select splitByChar(',', '1,2,3', ''); -- { serverError 43 } -select splitByChar(',', '1,2,3', ''); -- { serverError 43 } diff --git a/tests/queries/0_stateless/02475_split_with_max_substrings.reference b/tests/queries/0_stateless/02475_split_with_max_substrings.reference new file mode 100644 index 00000000000..d55ef45a5e0 --- /dev/null +++ 
b/tests/queries/0_stateless/02475_split_with_max_substrings.reference @@ -0,0 +1,44 @@ +['1','2','3'] +['1','2','3'] +['1','2','3'] +['1'] +['1','2'] +['1','2','3'] +['1','2','3'] +['one','two','three',''] +['one','two','three',''] +['one','two','three',''] +['one'] +['one','two'] +['one','two','three'] +['one','two','three',''] +['one','two','three',''] +['abca','abc'] +['abca','abc'] +['abca','abc'] +['abca'] +['abca','abc'] +['abca','abc'] +['abca','abc'] +['1','a','b'] +['1','a','b'] +['1','a','b'] +['1'] +['1','a'] +['1','a','b'] +['1','a','b'] +['1!','a,','b.'] +['1!','a,','b.'] +['1!','a,','b.'] +['1!'] +['1!','a,'] +['1!','a,','b.'] +['1!','a,','b.'] +['1','2 3','4,5','abcde'] +['1','2 3','4,5','abcde'] +['1','2 3','4,5','abcde'] +['1'] +['1','2 3'] +['1','2 3','4,5'] +['1','2 3','4,5','abcde'] +['1','2 3','4,5','abcde'] diff --git a/tests/queries/0_stateless/02475_split_with_max_substrings.sql b/tests/queries/0_stateless/02475_split_with_max_substrings.sql new file mode 100644 index 00000000000..c51133c604e --- /dev/null +++ b/tests/queries/0_stateless/02475_split_with_max_substrings.sql @@ -0,0 +1,59 @@ +select splitByChar(',', '1,2,3'); +select splitByChar(',', '1,2,3', -1); +select splitByChar(',', '1,2,3', 0); +select splitByChar(',', '1,2,3', 1); +select splitByChar(',', '1,2,3', 2); +select splitByChar(',', '1,2,3', 3); +select splitByChar(',', '1,2,3', 4); + +select splitByRegexp('[ABC]', 'oneAtwoBthreeC'); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4); +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5); + +SELECT alphaTokens('abca1abc'); +SELECT alphaTokens('abca1abc', -1); +SELECT alphaTokens('abca1abc', 0); +SELECT alphaTokens('abca1abc', 1); +SELECT alphaTokens('abca1abc', 2); 
+SELECT alphaTokens('abca1abc', 3); + +SELECT splitByAlpha('abca1abc'); + +SELECT splitByNonAlpha(' 1! a, b. '); +SELECT splitByNonAlpha(' 1! a, b. ', -1); +SELECT splitByNonAlpha(' 1! a, b. ', 0); +SELECT splitByNonAlpha(' 1! a, b. ', 1); +SELECT splitByNonAlpha(' 1! a, b. ', 2); +SELECT splitByNonAlpha(' 1! a, b. ', 3); +SELECT splitByNonAlpha(' 1! a, b. ', 4); + +SELECT splitByWhitespace(' 1! a, b. '); +SELECT splitByWhitespace(' 1! a, b. ', -1); +SELECT splitByWhitespace(' 1! a, b. ', 0); +SELECT splitByWhitespace(' 1! a, b. ', 1); +SELECT splitByWhitespace(' 1! a, b. ', 2); +SELECT splitByWhitespace(' 1! a, b. ', 3); +SELECT splitByWhitespace(' 1! a, b. ', 4); + +SELECT splitByString(', ', '1, 2 3, 4,5, abcde'); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', -1); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 0); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 1); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 2); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 3); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 4); +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 5); + + +select splitByChar(',', '1,2,3', ''); -- { serverError 43 } +select splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 } +SELECT alphaTokens('abca1abc', ''); -- { serverError 43 } +SELECT splitByAlpha('abca1abc', ''); -- { serverError 43 } +SELECT splitByNonAlpha(' 1! a, b. ', ''); -- { serverError 43 } +SELECT splitByWhitespace(' 1! a, b. ', ''); -- { serverError 43 } +SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 } \ No newline at end of file