mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
improve doc and uts
This commit is contained in:
parent
a51a1b4394
commit
0df7e95845
@ -6,7 +6,7 @@ sidebar_label: Splitting and Merging Strings and Arrays
|
||||
|
||||
# Functions for Splitting and Merging Strings and Arrays
|
||||
|
||||
## splitByChar(separator, s)
|
||||
## splitByChar(separator, s[, max_substrings])
|
||||
|
||||
Splits a string into substrings separated by a specified character. It uses a constant string `separator` which consists of exactly one character.
|
||||
Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators.
|
||||
@ -14,13 +14,14 @@ Returns an array of selected substrings. Empty substrings may be selected if the
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
splitByChar(separator, s)
|
||||
splitByChar(separator, s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md).
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
@ -44,20 +45,22 @@ SELECT splitByChar(',', '1,2,3,abcde');
|
||||
└─────────────────────────────────┘
|
||||
```
|
||||
|
||||
## splitByString(separator, s)
|
||||
## splitByString(separator, s[, max_substrings])
|
||||
|
||||
Splits a string into substrings separated by a string. It uses a constant string `separator` of multiple characters as the separator. If the string `separator` is empty, it will split the string `s` into an array of single characters.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
splitByString(separator, s)
|
||||
splitByString(separator, s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `separator` — The separator. [String](../../sql-reference/data-types/string.md).
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
@ -91,20 +94,22 @@ SELECT splitByString('', 'abcde');
|
||||
└────────────────────────────┘
|
||||
```
|
||||
|
||||
## splitByRegexp(regexp, s)
|
||||
## splitByRegexp(regexp, s[, max_substrings])
|
||||
|
||||
Splits a string into substrings separated by a regular expression. It uses a regular expression string `regexp` as the separator. If the `regexp` is empty, it will split the string `s` into an array of single characters. If no match is found for this regular expression, the string `s` won't be split.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
splitByRegexp(regexp, s)
|
||||
splitByRegexp(regexp, s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
@ -146,7 +151,7 @@ Result:
|
||||
└────────────────────────────┘
|
||||
```
|
||||
|
||||
## splitByWhitespace(s)
|
||||
## splitByWhitespace(s[, max_substrings])
|
||||
|
||||
Splits a string into substrings separated by whitespace characters.
|
||||
Returns an array of selected substrings.
|
||||
@ -154,12 +159,14 @@ Returns an array of selected substrings.
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
splitByWhitespace(s)
|
||||
splitByWhitespace(s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
@ -179,7 +186,7 @@ SELECT splitByWhitespace(' 1! a, b. ');
|
||||
└─────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## splitByNonAlpha(s)
|
||||
## splitByNonAlpha(s[, max_substrings])
|
||||
|
||||
Splits a string into substrings separated by whitespace and punctuation characters.
|
||||
Returns an array of selected substrings.
|
||||
@ -187,12 +194,14 @@ Returns an array of selected substrings.
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
splitByNonAlpha(s)
|
||||
splitByNonAlpha(s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
@ -217,10 +226,28 @@ SELECT splitByNonAlpha(' 1! a, b. ');
|
||||
Concatenates string representations of values listed in the array with the separator. `separator` is an optional parameter: a constant string, set to an empty string by default.
|
||||
Returns the string.
|
||||
|
||||
## alphaTokens(s)
|
||||
## alphaTokens(s[, max_substrings]), splitByAlpha(s[, max_substrings])
|
||||
|
||||
Selects substrings of consecutive bytes from the ranges a-z and A-Z. Returns an array of substrings.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
alphaTokens(s[, max_substrings])
|
||||
splitByAlpha(s[, max_substrings])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
|
||||
- `max_substrings` — An optional integer (any native integer type, e.g. `Int64`), defaulting to 0. When `max_substrings` > 0, the returned array contains at most `max_substrings` substrings; otherwise (zero or negative) the function returns as many substrings as possible.
|
||||
|
||||
**Returned value(s)**
|
||||
|
||||
Returns an array of selected substrings.
|
||||
|
||||
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
||||
|
||||
**Example**
|
||||
|
||||
``` sql
|
||||
|
@ -30,7 +30,9 @@ DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const DataTypes & argum
|
||||
REGISTER_FUNCTION(StringArray)
|
||||
{
|
||||
factory.registerFunction<FunctionExtractAll>();
|
||||
|
||||
factory.registerFunction<FunctionSplitByAlpha>();
|
||||
factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name);
|
||||
factory.registerFunction<FunctionSplitByNonAlpha>();
|
||||
factory.registerFunction<FunctionSplitByWhitespace>();
|
||||
factory.registerFunction<FunctionSplitByChar>();
|
||||
|
@ -32,12 +32,12 @@ namespace ErrorCodes
|
||||
|
||||
/** Functions that split strings into an array of strings or vice versa.
|
||||
*
|
||||
* splitByChar(sep, s)
|
||||
* splitByString(sep, s)
|
||||
* splitByRegexp(regexp, s)
|
||||
* splitByChar(sep, s[, max_substrings])
|
||||
* splitByString(sep, s[, max_substrings])
|
||||
* splitByRegexp(regexp, s[, max_substrings])
|
||||
*
|
||||
* splitByWhitespace(s) - split the string by whitespace characters
|
||||
* splitByNonAlpha(s) - split the string by whitespace and punctuation characters
|
||||
* splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters
|
||||
* splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters
|
||||
*
|
||||
* extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
|
||||
* - first subpattern, if regexp has subpattern;
|
||||
@ -48,7 +48,7 @@ namespace ErrorCodes
|
||||
* arrayStringConcat(arr, delimiter)
|
||||
* - join an array of strings into one string via a separator.
|
||||
*
|
||||
* alphaTokens(s) - select from the string subsequence `[a-zA-Z]+`.
|
||||
* alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`.
|
||||
*
|
||||
* URL functions are located separately.
|
||||
*/
|
||||
@ -622,14 +622,13 @@ public:
|
||||
Generator::checkArguments(arguments);
|
||||
|
||||
const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition();
|
||||
if (max_substrings_pos)
|
||||
if (arguments.size() > *max_substrings_pos && !isNativeInteger(arguments[*max_substrings_pos]))
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"{}-th argument for function '{}' must be integer, got '{}' instead",
|
||||
*max_substrings_pos + 1,
|
||||
getName(),
|
||||
arguments[*max_substrings_pos]->getName());
|
||||
if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos]))
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"{}-th argument for function '{}' must be integer, got '{}' instead",
|
||||
*max_substrings_pos + 1,
|
||||
getName(),
|
||||
arguments[*max_substrings_pos]->getName());
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
||||
}
|
||||
@ -734,16 +733,16 @@ private:
|
||||
{
|
||||
const auto pos = Generator::getMaxSubstringsArgumentPosition();
|
||||
if (!pos)
|
||||
return {};
|
||||
return std::nullopt;
|
||||
|
||||
if (arguments.size() <= *pos)
|
||||
return {};
|
||||
if (*pos >= arguments.size())
|
||||
return std::nullopt;
|
||||
|
||||
std::optional<Int64> max_substrings;
|
||||
if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[2]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[2]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[2]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[2])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[2]))))
|
||||
if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[*pos]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[*pos]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[*pos]))
|
||||
|| (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[*pos]))))
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {}, which is {}-th argument of function {}",
|
||||
@ -754,7 +753,7 @@ private:
|
||||
/// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to
|
||||
/// no max_substrings argument in function
|
||||
if (max_substrings && *max_substrings <= 0)
|
||||
return {};
|
||||
return std::nullopt;
|
||||
|
||||
return *max_substrings;
|
||||
}
|
||||
|
@ -5,11 +5,3 @@
|
||||
['gbye','bug']
|
||||
['']
|
||||
[]
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
['one']
|
||||
['one','two']
|
||||
['one','two','three']
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
|
@ -3,12 +3,3 @@ select splitByRegexp('', 'abcde');
|
||||
select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['<h1>hello<h2>world</h2></h1>', 'gbye<split>bug']) x);
|
||||
select splitByRegexp('ab', '');
|
||||
select splitByRegexp('', '');
|
||||
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC');
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5);
|
@ -1,10 +1 @@
|
||||
['1','2','3']
|
||||
['1']
|
||||
['1','2']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['expr1']
|
||||
['expr2']
|
||||
|
@ -1,12 +1 @@
|
||||
select splitByChar(',', '1,2,3');
|
||||
select splitByChar(',', '1,2,3', 1);
|
||||
select splitByChar(',', '1,2,3', 2);
|
||||
select splitByChar(',', '1,2,3', 3);
|
||||
select splitByChar(',', '1,2,3', 4);
|
||||
select splitByChar(',', '1,2,3', 0);
|
||||
select splitByChar(',', '1,2,3', -1);
|
||||
select splitByChar(',', '1,2,3', -2);
|
||||
SELECT splitByChar('=', s, 1) FROM values('s String', 'expr1=1+1=2', 'expr2=2+2=4=1+3');
|
||||
|
||||
select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
|
||||
select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
|
||||
|
@ -0,0 +1,44 @@
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['1']
|
||||
['1','2']
|
||||
['1','2','3']
|
||||
['1','2','3']
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
['one']
|
||||
['one','two']
|
||||
['one','two','three']
|
||||
['one','two','three','']
|
||||
['one','two','three','']
|
||||
['abca','abc']
|
||||
['abca','abc']
|
||||
['abca','abc']
|
||||
['abca']
|
||||
['abca','abc']
|
||||
['abca','abc']
|
||||
['abca','abc']
|
||||
['1','a','b']
|
||||
['1','a','b']
|
||||
['1','a','b']
|
||||
['1']
|
||||
['1','a']
|
||||
['1','a','b']
|
||||
['1','a','b']
|
||||
['1!','a,','b.']
|
||||
['1!','a,','b.']
|
||||
['1!','a,','b.']
|
||||
['1!']
|
||||
['1!','a,']
|
||||
['1!','a,','b.']
|
||||
['1!','a,','b.']
|
||||
['1','2 3','4,5','abcde']
|
||||
['1','2 3','4,5','abcde']
|
||||
['1','2 3','4,5','abcde']
|
||||
['1']
|
||||
['1','2 3']
|
||||
['1','2 3','4,5']
|
||||
['1','2 3','4,5','abcde']
|
||||
['1','2 3','4,5','abcde']
|
@ -0,0 +1,59 @@
|
||||
select splitByChar(',', '1,2,3');
|
||||
select splitByChar(',', '1,2,3', -1);
|
||||
select splitByChar(',', '1,2,3', 0);
|
||||
select splitByChar(',', '1,2,3', 1);
|
||||
select splitByChar(',', '1,2,3', 2);
|
||||
select splitByChar(',', '1,2,3', 3);
|
||||
select splitByChar(',', '1,2,3', 4);
|
||||
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC');
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4);
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5);
|
||||
|
||||
SELECT alphaTokens('abca1abc');
|
||||
SELECT alphaTokens('abca1abc', -1);
|
||||
SELECT alphaTokens('abca1abc', 0);
|
||||
SELECT alphaTokens('abca1abc', 1);
|
||||
SELECT alphaTokens('abca1abc', 2);
|
||||
SELECT alphaTokens('abca1abc', 3);
|
||||
|
||||
SELECT splitByAlpha('abca1abc');
|
||||
|
||||
SELECT splitByNonAlpha(' 1! a, b. ');
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', -1);
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', 0);
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', 1);
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', 2);
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', 3);
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', 4);
|
||||
|
||||
SELECT splitByWhitespace(' 1! a, b. ');
|
||||
SELECT splitByWhitespace(' 1! a, b. ', -1);
|
||||
SELECT splitByWhitespace(' 1! a, b. ', 0);
|
||||
SELECT splitByWhitespace(' 1! a, b. ', 1);
|
||||
SELECT splitByWhitespace(' 1! a, b. ', 2);
|
||||
SELECT splitByWhitespace(' 1! a, b. ', 3);
|
||||
SELECT splitByWhitespace(' 1! a, b. ', 4);
|
||||
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde');
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', -1);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 0);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 1);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 2);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 3);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 4);
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 5);
|
||||
|
||||
|
||||
select splitByChar(',', '1,2,3', ''); -- { serverError 43 }
|
||||
select splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 }
|
||||
SELECT alphaTokens('abca1abc', ''); -- { serverError 43 }
|
||||
SELECT splitByAlpha('abca1abc', ''); -- { serverError 43 }
|
||||
SELECT splitByNonAlpha(' 1! a, b. ', ''); -- { serverError 43 }
|
||||
SELECT splitByWhitespace(' 1! a, b. ', ''); -- { serverError 43 }
|
||||
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 }
|
Loading…
Reference in New Issue
Block a user