improve doc and uts

2024-11-22 23:52:03 +00:00 · 2022-11-03 16:12:19 +08:00 · 2022-11-03 16:12:19 +08:00 · 0df7e95845
commit 0df7e95845
parent a51a1b4394
9 changed files with 164 additions and 70 deletions
--- a/docs/en/sql-reference/functions/splitting-merging-functions.md
+++ b/docs/en/sql-reference/functions/splitting-merging-functions.md
@ -6,7 +6,7 @@ sidebar_label: Splitting and Merging Strings and Arrays

 # Functions for Splitting and Merging Strings and Arrays

-## splitByChar(separator, s)
+## splitByChar(separator, s[, max_substrings])

 Splits a string into substrings separated by a specified character. It uses a constant string `separator` which consists of exactly one character.
 Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators.
@ -14,13 +14,14 @@ Returns an array of selected substrings. Empty substrings may be selected if the
 **Syntax**

 ``` sql
-splitByChar(separator, s)
+splitByChar(separator, s[, max_substrings])
 ```

 **Arguments**

 -   `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md).
 -   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.

 **Returned value(s)**

@ -44,20 +45,22 @@ SELECT splitByChar(',', '1,2,3,abcde');
 └─────────────────────────────────┘
 ```

-## splitByString(separator, s)
+## splitByString(separator, s[, max_substrings])

 Splits a string into substrings separated by a string. It uses a constant string `separator` of multiple characters as the separator. If the string `separator` is empty, it will split the string `s` into an array of single characters.

 **Syntax**

 ``` sql
-splitByString(separator, s)
+splitByString(separator, s[, max_substrings])
 ```

 **Arguments**

 -   `separator` — The separator. [String](../../sql-reference/data-types/string.md).
 -   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.
+

 **Returned value(s)**

@ -91,20 +94,22 @@ SELECT splitByString('', 'abcde');
 └────────────────────────────┘
 ```

-## splitByRegexp(regexp, s)
+## splitByRegexp(regexp, s[, max_substrings])

 Splits a string into substrings separated by a regular expression. It uses a regular expression string `regexp` as the separator. If the `regexp` is empty, it will split the string `s` into an array of single characters. If no match is found for this regular expression, the string `s` won't be split.

 **Syntax**

 ``` sql
-splitByRegexp(regexp, s)
+splitByRegexp(regexp, s[, max_substrings])
 ```

 **Arguments**

 -   `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
 -   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.
+

 **Returned value(s)**

@ -146,7 +151,7 @@ Result:
 └────────────────────────────┘
 ```

-## splitByWhitespace(s)
+## splitByWhitespace(s[, max_substrings])

 Splits a string into substrings separated by whitespace characters. 
 Returns an array of selected substrings.
@ -154,12 +159,14 @@ Returns an array of selected substrings.
 **Syntax**

 ``` sql
-splitByWhitespace(s)
+splitByWhitespace(s[, max_substrings])
 ```

 **Arguments**

 -   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.
+

 **Returned value(s)**

@ -179,7 +186,7 @@ SELECT splitByWhitespace('  1!  a,  b.  ');
 └─────────────────────────────────────┘
 ```

-## splitByNonAlpha(s)
+## splitByNonAlpha(s[, max_substrings])

 Splits a string into substrings separated by whitespace and punctuation characters. 
 Returns an array of selected substrings.
@ -187,12 +194,14 @@ Returns an array of selected substrings.
 **Syntax**

 ``` sql
-splitByNonAlpha(s)
+splitByNonAlpha(s[, max_substrings])
 ```

 **Arguments**

 -   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.
+

 **Returned value(s)**

@ -217,10 +226,28 @@ SELECT splitByNonAlpha('  1!  a,  b.  ');
 Concatenates string representations of values listed in the array with the separator. `separator` is an optional parameter: a constant string, set to an empty string by default.
 Returns the string.

-## alphaTokens(s)
+## alphaTokens(s[, max_substrings]), splitByAlpha(s[, max_substrings])

 Selects substrings of consecutive bytes from the ranges a-z and A-Z. Returns an array of substrings.

+**Syntax**
+
+``` sql
+alphaTokens(s[, max_substrings])
+splitByAlpha(s[, max_substrings])
+```
+
+**Arguments**
+
+-   `s` — The string to split. [String](../../sql-reference/data-types/string.md).
+-   `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, at most `max_substrings` substrings are returned; otherwise (zero or negative) the function returns as many substrings as possible.
+
+**Returned value(s)**
+
+Returns an array of selected substrings.
+
+Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
+
 **Example**

 ``` sql
--- a/src/Functions/FunctionsStringArray.cpp
+++ b/src/Functions/FunctionsStringArray.cpp
@ -30,7 +30,9 @@ DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const DataTypes & argum
 REGISTER_FUNCTION(StringArray)
 {
    factory.registerFunction<FunctionExtractAll>();
+
    factory.registerFunction<FunctionSplitByAlpha>();
+    factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name);
    factory.registerFunction<FunctionSplitByNonAlpha>();
    factory.registerFunction<FunctionSplitByWhitespace>();
    factory.registerFunction<FunctionSplitByChar>();
--- a/src/Functions/FunctionsStringArray.h
+++ b/src/Functions/FunctionsStringArray.h
@ -32,12 +32,12 @@ namespace ErrorCodes

 /** Functions that split strings into an array of strings or vice versa.
  *
-  * splitByChar(sep, s)
-  * splitByString(sep, s)
-  * splitByRegexp(regexp, s)
+  * splitByChar(sep, s[, max_substrings])
+  * splitByString(sep, s[, max_substrings])
+  * splitByRegexp(regexp, s[, max_substrings])
  *
-  * splitByWhitespace(s)      - split the string by whitespace characters
-  * splitByNonAlpha(s)        - split the string by whitespace and punctuation characters
+  * splitByWhitespace(s[, max_substrings])      - split the string by whitespace characters
+  * splitByNonAlpha(s[, max_substrings])        - split the string by whitespace and punctuation characters
  *
  * extractAll(s, regexp)     - select from the string the subsequences corresponding to the regexp.
  * - first subpattern, if regexp has subpattern;
@ -48,7 +48,7 @@ namespace ErrorCodes
  * arrayStringConcat(arr, delimiter)
  * - join an array of strings into one string via a separator.
  *
-  * alphaTokens(s)            - select from the string subsequence `[a-zA-Z]+`.
+  * alphaTokens(s[, max_substrings])            - select from the string subsequence `[a-zA-Z]+`.
  *
  * URL functions are located separately.
  */
@ -622,14 +622,13 @@ public:
        Generator::checkArguments(arguments);

        const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition();
-        if (max_substrings_pos)
-            if (arguments.size() > *max_substrings_pos && !isNativeInteger(arguments[*max_substrings_pos]))
-                throw Exception(
-                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
-                    "{}-th argument for function '{}' must be integer, got '{}' instead",
-                    *max_substrings_pos + 1,
-                    getName(),
-                    arguments[*max_substrings_pos]->getName());
+        if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos]))
+            throw Exception(
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "{}-th argument for function '{}' must be integer, got '{}' instead",
+                *max_substrings_pos + 1,
+                getName(),
+                arguments[*max_substrings_pos]->getName());

        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
    }
@ -734,16 +733,16 @@ private:
    {
        const auto pos = Generator::getMaxSubstringsArgumentPosition();
        if (!pos)
-            return {};
+            return std::nullopt;

-        if (arguments.size() <= *pos)
-            return {};
+        if (*pos >= arguments.size())
+            return std::nullopt;

        std::optional<Int64> max_substrings;
-        if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[2]))
-              || (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[2]))
-              || (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[2]))
-              || (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[2]))))
+        if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[*pos]))
+              || (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[*pos]))
+              || (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[*pos]))
+              || (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[*pos]))))
            throw Exception(
                ErrorCodes::ILLEGAL_COLUMN,
                "Illegal column {}, which is {}-th argument of function {}",
@ -754,7 +753,7 @@ private:
        /// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to
        /// no max_substrings argument in function
        if (max_substrings && *max_substrings <= 0)
-            return {};
+            return std::nullopt;

        return *max_substrings;
    }
--- a/tests/queries/0_stateless/01866_split_by_regexp.reference
+++ b/tests/queries/0_stateless/01866_split_by_regexp.reference
@ -5,11 +5,3 @@
 ['gbye','bug']
 ['']
 []
-['one','two','three','']
-['one','two','three','']
-['one','two','three','']
-['one']
-['one','two']
-['one','two','three']
-['one','two','three','']
-['one','two','three','']
--- a/tests/queries/0_stateless/01866_split_by_regexp.sql
+++ b/tests/queries/0_stateless/01866_split_by_regexp.sql
@ -3,12 +3,3 @@ select splitByRegexp('', 'abcde');
 select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['<h1>hello<h2>world</h2></h1>', 'gbye<split>bug']) x);
 select splitByRegexp('ab', '');
 select splitByRegexp('', '');
-
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC');
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4);
-select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5);
--- a/tests/queries/0_stateless/02185_split_by_char.reference
+++ b/tests/queries/0_stateless/02185_split_by_char.reference
@ -1,10 +1 @@
 ['1','2','3']
-['1']
-['1','2']
-['1','2','3']
-['1','2','3']
-['1','2','3']
-['1','2','3']
-['1','2','3']
-['expr1']
-['expr2']
--- a/tests/queries/0_stateless/02185_split_by_char.sql
+++ b/tests/queries/0_stateless/02185_split_by_char.sql
@ -1,12 +1 @@
 select splitByChar(',', '1,2,3');
-select splitByChar(',', '1,2,3', 1);
-select splitByChar(',', '1,2,3', 2);
-select splitByChar(',', '1,2,3', 3);
-select splitByChar(',', '1,2,3', 4);
-select splitByChar(',', '1,2,3', 0);
-select splitByChar(',', '1,2,3', -1);
-select splitByChar(',', '1,2,3', -2);
-SELECT splitByChar('=', s, 1) FROM values('s String', 'expr1=1+1=2', 'expr2=2+2=4=1+3');
-
-select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
-select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
--- a/tests/queries/0_stateless/02475_split_with_max_substrings.reference
+++ b/tests/queries/0_stateless/02475_split_with_max_substrings.reference
@ -0,0 +1,44 @@
+['1','2','3']
+['1','2','3']
+['1','2','3']
+['1']
+['1','2']
+['1','2','3']
+['1','2','3']
+['one','two','three','']
+['one','two','three','']
+['one','two','three','']
+['one']
+['one','two']
+['one','two','three']
+['one','two','three','']
+['one','two','three','']
+['abca','abc']
+['abca','abc']
+['abca','abc']
+['abca']
+['abca','abc']
+['abca','abc']
+['abca','abc']
+['1','a','b']
+['1','a','b']
+['1','a','b']
+['1']
+['1','a']
+['1','a','b']
+['1','a','b']
+['1!','a,','b.']
+['1!','a,','b.']
+['1!','a,','b.']
+['1!']
+['1!','a,']
+['1!','a,','b.']
+['1!','a,','b.']
+['1','2 3','4,5','abcde']
+['1','2 3','4,5','abcde']
+['1','2 3','4,5','abcde']
+['1']
+['1','2 3']
+['1','2 3','4,5']
+['1','2 3','4,5','abcde']
+['1','2 3','4,5','abcde']
--- a/tests/queries/0_stateless/02475_split_with_max_substrings.sql
+++ b/tests/queries/0_stateless/02475_split_with_max_substrings.sql
@ -0,0 +1,59 @@
+select splitByChar(',', '1,2,3');
+select splitByChar(',', '1,2,3', -1);
+select splitByChar(',', '1,2,3', 0);
+select splitByChar(',', '1,2,3', 1);
+select splitByChar(',', '1,2,3', 2);
+select splitByChar(',', '1,2,3', 3);
+select splitByChar(',', '1,2,3', 4);
+
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC');
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4);
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5);
+
+SELECT alphaTokens('abca1abc');
+SELECT alphaTokens('abca1abc', -1);
+SELECT alphaTokens('abca1abc', 0);
+SELECT alphaTokens('abca1abc', 1);
+SELECT alphaTokens('abca1abc', 2);
+SELECT alphaTokens('abca1abc', 3);
+
+SELECT splitByAlpha('abca1abc');
+
+SELECT splitByNonAlpha('  1!  a,  b.  ');
+SELECT splitByNonAlpha('  1!  a,  b.  ', -1);
+SELECT splitByNonAlpha('  1!  a,  b.  ',  0);
+SELECT splitByNonAlpha('  1!  a,  b.  ',  1);
+SELECT splitByNonAlpha('  1!  a,  b.  ',  2);
+SELECT splitByNonAlpha('  1!  a,  b.  ',  3);
+SELECT splitByNonAlpha('  1!  a,  b.  ',  4);
+
+SELECT splitByWhitespace('  1!  a,  b.  ');
+SELECT splitByWhitespace('  1!  a,  b.  ', -1);
+SELECT splitByWhitespace('  1!  a,  b.  ', 0);
+SELECT splitByWhitespace('  1!  a,  b.  ', 1);
+SELECT splitByWhitespace('  1!  a,  b.  ', 2);
+SELECT splitByWhitespace('  1!  a,  b.  ', 3);
+SELECT splitByWhitespace('  1!  a,  b.  ', 4);
+
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde');
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', -1);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 0);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 1);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 2);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 3);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 4);
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 5);
+
+
+select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
+select splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 }
+SELECT alphaTokens('abca1abc', ''); -- { serverError 43 }
+SELECT splitByAlpha('abca1abc', ''); -- { serverError 43 }
+SELECT splitByNonAlpha('  1!  a,  b.  ',  ''); -- { serverError 43 }
+SELECT splitByWhitespace('  1!  a,  b.  ', ''); -- { serverError 43 }
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 }