Merge pull request #54518 from rschu1ze/split-better

Provide fallback to Python/Spark-like splitting in splitBy*() functions
This commit is contained in:
Robert Schulze 2023-09-22 12:07:45 +02:00 committed by GitHub
commit be1e92a2ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 702 additions and 413 deletions

View File

@ -4067,6 +4067,17 @@ Result:
└─────┴─────┴───────┘
```
## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string}
Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array.
Possible values:
- `0` - The remaining string will not be included in the last element of the result array.
- `1` - The remaining string will be included in the last element of the result array. This is the behavior of Spark's [`split()`](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.split.html) function and Python's [`str.split()`](https://docs.python.org/3/library/stdtypes.html#str.split) method.
Default value: `0`
## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions}
Enables or disables returning results of type:

View File

@ -21,7 +21,7 @@ splitByChar(separator, s[, max_substrings]))
- `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md).
- `s` — The string to split. [String](../../sql-reference/data-types/string.md).
- `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible.
- `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, the returned array will contain at most `max_substrings` substrings, otherwise the function will return as many substrings as possible.
**Returned value(s)**
@ -38,6 +38,10 @@ The behavior of parameter `max_substrings` changed starting with ClickHouse v22.
For example,
- in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']`
- in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']`
A behavior similar to ClickHouse pre-v22.11 can be achieved by enabling setting
[splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string), for example:
`SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1 -- ['a', 'b=c=d']`
:::
**Example**
@ -80,6 +84,8 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere
- There are multiple consecutive non-empty separators;
- The original string `s` is empty while the separator is not empty.
Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls whether the remaining string is included in the last element of the result array when argument `max_substrings` > 0.
**Example**
``` sql
@ -133,6 +139,8 @@ Returns an array of selected substrings. Empty substrings may be selected when:
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls whether the remaining string is included in the last element of the result array when argument `max_substrings` > 0.
**Example**
``` sql
@ -182,6 +190,8 @@ Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls whether the remaining string is included in the last element of the result array when argument `max_substrings` > 0.
**Example**
``` sql
@ -219,6 +229,8 @@ Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls whether the remaining string is included in the last element of the result array when argument `max_substrings` > 0.
**Example**
``` sql
@ -279,6 +291,8 @@ Returns an array of selected substrings.
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls whether the remaining string is included in the last element of the result array when argument `max_substrings` > 0.
**Example**
``` sql

View File

@ -503,6 +503,7 @@ class IColumn;
M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
M(Bool, splitby_max_substrings_includes_remaining_string, false, "Functions 'splitBy*()' with 'max_substrings' argument > 0 include the remaining string as last element in the result", 0) \
\
M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \
M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \

View File

@ -104,7 +104,7 @@ void validateArgumentType(const IFunction & func, const DataTypes & arguments,
const auto & argument = arguments[argument_index];
if (!validator_func(*argument))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {} expected {}",
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of {} argument of function {}, expected {}",
argument->getName(), std::to_string(argument_index), func.getName(), expected_type_description);
}

View File

@ -5,23 +5,53 @@ namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_COLUMN;
}
DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const DataTypes & arguments) const
template <typename DataType>
std::optional<Int64> extractMaxSplitsImpl(const ColumnWithTypeAndName & argument)
{
if (arguments.size() != 1 && arguments.size() != 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be 1 or 2.",
getName(), arguments.size());
const auto * col = checkAndGetColumnConst<ColumnVector<DataType>>(argument.column.get());
if (!col)
return std::nullopt;
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get());
if (!array_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", getName());
auto value = col->template getValue<DataType>();
return static_cast<Int64>(value);
}
if (arguments.size() == 2 && !isString(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} must be constant string.", getName());
/// Reads the `max_substrings` argument located at `max_substrings_argument_position`.
///
/// Returns std::nullopt when the argument was not passed at all or when its value is zero or negative
/// (no limit is applied in either case); otherwise returns the extracted value.
/// Throws ILLEGAL_COLUMN when none of the probed constant integer column types matches the argument.
std::optional<size_t> extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position)
{
    const bool argument_was_passed = max_substrings_argument_position < arguments.size();
    if (!argument_was_passed)
        return std::nullopt;

    const auto & argument = arguments[max_substrings_argument_position];

    /// The concrete integer type of the constant is unknown at this point, so every native integer type is
    /// probed in a fixed order and the first one that matches wins.
    std::optional<Int64> limit = extractMaxSplitsImpl<UInt8>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<Int8>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<UInt16>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<Int16>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<UInt32>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<Int32>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<UInt64>(argument);
    if (!limit)
        limit = extractMaxSplitsImpl<Int64>(argument);

    if (!limit)
    {
        throw Exception(
            ErrorCodes::ILLEGAL_COLUMN,
            "Illegal column {}, which is {}-th argument",
            argument.column->getName(),
            max_substrings_argument_position + 1);
    }

    /// Zero and negative values mean "no limit", same as omitting the argument.
    if (*limit <= 0)
        return std::nullopt;

    return limit;
}
/// Validates the argument list of FunctionArrayStringConcat and reports String as the result type.
/// Argument validation is delegated to validateFunctionArgumentTypes().
DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const
{
    /// Always required: the array argument.
    FunctionArgumentDescriptors required_arguments{
        {"arr", &isArray<IDataType>, nullptr, "Array"},
    };

    /// May be omitted: the separator, described to the validator as a constant String.
    FunctionArgumentDescriptors trailing_arguments{
        {"separator", &isString<IDataType>, isColumnConst, "const String"},
    };

    validateFunctionArgumentTypes(*this, arguments, required_arguments, trailing_arguments);

    DataTypePtr result_type = std::make_shared<DataTypeString>();
    return result_type;
}

View File

@ -23,10 +23,8 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
@ -56,6 +54,7 @@ namespace ErrorCodes
using Pos = const char *;
std::optional<size_t> extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position);
/// Substring generators. All of them have a common interface.
@ -64,9 +63,11 @@ class SplitByAlphaImpl
private:
Pos pos;
Pos end;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
/// Get the name of the function.
static constexpr auto name = "alphaTokens";
static String getName() { return name; }
@ -74,37 +75,33 @@ public:
static size_t getNumberOfArguments() { return 0; }
/// Check the type of the function's arguments.
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (arguments.empty() || arguments.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName());
FunctionArgumentDescriptors mandatory_args{
{"s", &isString<IDataType>, nullptr, "String"},
};
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors optional_args{
{"max_substrings", &isNativeInteger<IDataType>, isColumnConst, "const Number"},
};
validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
static constexpr auto strings_argument_position = 0uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 1);
}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
}
/// Returns the position of the argument, that is the column of strings
static size_t getStringsArgumentPosition()
{
return 0;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 1;
splits = 0;
}
/// Get the next token, if any, or return false.
@ -119,10 +116,27 @@ public:
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = end;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
while (pos < end && isAlphaASCII(*pos))
++pos;
token_end = pos;
++splits;
return true;
}
@ -133,6 +147,9 @@ class SplitByNonAlphaImpl
private:
Pos pos;
Pos end;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
/// Get the name of the function.
@ -142,37 +159,25 @@ public:
static bool isVariadic() { return true; }
static size_t getNumberOfArguments() { return 0; }
/// Check the type of the function's arguments.
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (arguments.empty() || arguments.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName());
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
SplitByAlphaImpl::checkArguments(func, arguments);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
static constexpr auto strings_argument_position = 0uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 1);
}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
}
/// Returns the position of the argument, that is the column of strings
static size_t getStringsArgumentPosition()
{
return 0;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 1;
splits = 0;
}
/// Get the next token, if any, or return false.
@ -187,10 +192,27 @@ public:
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = end;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
++pos;
token_end = pos;
splits++;
return true;
}
@ -201,46 +223,36 @@ class SplitByWhitespaceImpl
private:
Pos pos;
Pos end;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
/// Get the name of the function.
static constexpr auto name = "splitByWhitespace";
static String getName() { return name; }
static bool isVariadic() { return true; }
static size_t getNumberOfArguments() { return 0; }
/// Check the type of the function's arguments.
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (arguments.empty() || arguments.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", getName());
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
return SplitByNonAlphaImpl::checkArguments(func, arguments);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
static constexpr auto strings_argument_position = 0uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 1);
}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
}
/// Returns the position of the argument, that is the column of strings
static size_t getStringsArgumentPosition()
{
return 0;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 1;
splits = 0;
}
/// Get the next token, if any, or return false.
@ -255,10 +267,27 @@ public:
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = end;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
while (pos < end && !isWhitespaceASCII(*pos))
++pos;
token_end = pos;
splits++;
return true;
}
@ -269,7 +298,10 @@ class SplitByCharImpl
private:
Pos pos;
Pos end;
char sep;
char separator;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
static constexpr auto name = "splitByChar";
@ -277,24 +309,23 @@ public:
static bool isVariadic() { return true; }
static size_t getNumberOfArguments() { return 0; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (arguments.size() < 2 || arguments.size() > 3)
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.",
name, arguments.size());
FunctionArgumentDescriptors mandatory_args{
{"separator", &isString<IDataType>, isColumnConst, "const String"},
{"s", &isString<IDataType>, nullptr, "String"}
};
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors optional_args{
{"max_substrings", &isNativeInteger<IDataType>, isColumnConst, "const Number"},
};
if (!isString(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of second argument of function {}. "
"Must be String.", arguments[1]->getName(), getName());
validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args);
}
void init(const ColumnsWithTypeAndName & arguments)
static constexpr auto strings_argument_position = 1uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
@ -307,25 +338,17 @@ public:
if (sep_str.size() != 1)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", getName());
sep = sep_str[0];
}
separator = sep_str[0];
/// Returns the position of the argument, that is the column of strings
static size_t getStringsArgumentPosition()
{
return 1;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 2;
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 2);
}
void set(Pos pos_, Pos end_)
{
pos = pos_;
end = end_;
splits = 0;
}
bool get(Pos & token_begin, Pos & token_end)
@ -334,12 +357,29 @@ public:
return false;
token_begin = pos;
pos = reinterpret_cast<Pos>(memchr(pos, sep, end - pos));
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = nullptr;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
pos = reinterpret_cast<Pos>(memchr(pos, separator, end - pos));
if (pos)
{
token_end = pos;
++pos;
++splits;
}
else
token_end = end;
@ -354,8 +394,10 @@ class SplitByStringImpl
private:
Pos pos;
Pos end;
String sep;
String separator;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
static constexpr auto name = "splitByString";
@ -363,12 +405,14 @@ public:
static bool isVariadic() { return true; }
static size_t getNumberOfArguments() { return 0; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
SplitByCharImpl::checkArguments(arguments);
SplitByCharImpl::checkArguments(func, arguments);
}
void init(const ColumnsWithTypeAndName & arguments)
static constexpr auto strings_argument_position = 1uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
@ -376,19 +420,10 @@ public:
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
"Must be constant string.", arguments[0].column->getName(), getName());
sep = col->getValue<String>();
}
separator = col->getValue<String>();
/// Returns the position of the argument that is the column of strings
static size_t getStringsArgumentPosition()
{
return 1;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 2;
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 2);
}
/// Called for each next string.
@ -396,19 +431,38 @@ public:
{
pos = pos_;
end = end_;
splits = 0;
}
/// Get the next token, if any, or return false.
bool get(Pos & token_begin, Pos & token_end)
{
if (sep.empty())
if (separator.empty())
{
if (pos == end)
return false;
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = end;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
pos += 1;
token_end = pos;
++splits;
}
else
{
@ -417,12 +471,28 @@ public:
token_begin = pos;
pos = reinterpret_cast<Pos>(memmem(pos, end - pos, sep.data(), sep.size()));
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = nullptr;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
pos = reinterpret_cast<Pos>(memmem(pos, end - pos, separator.data(), separator.size()));
if (pos)
{
token_end = pos;
pos += sep.size();
pos += separator.size();
++splits;
}
else
token_end = end;
@ -441,6 +511,10 @@ private:
Pos pos;
Pos end;
std::optional<size_t> max_splits;
size_t splits;
bool max_substrings_includes_remaining_string;
public:
static constexpr auto name = "splitByRegexp";
static String getName() { return name; }
@ -448,14 +522,14 @@ public:
static bool isVariadic() { return true; }
static size_t getNumberOfArguments() { return 0; }
/// Check the type of function arguments.
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
SplitByStringImpl::checkArguments(arguments);
SplitByStringImpl::checkArguments(func, arguments);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & arguments)
static constexpr auto strings_argument_position = 1uz;
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
{
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
@ -465,18 +539,9 @@ public:
if (!col->getValue<String>().empty())
re = std::make_shared<OptimizedRegularExpression>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
}
/// Returns the position of the argument that is the column of strings
static size_t getStringsArgumentPosition()
{
return 1;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return 2;
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
max_splits = extractMaxSplits(arguments, 2);
}
/// Called for each next string.
@ -484,6 +549,7 @@ public:
{
pos = pos_;
end = end_;
splits = 0;
}
/// Get the next token, if any, or return false.
@ -495,8 +561,26 @@ public:
return false;
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = end;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
pos += 1;
token_end = pos;
++splits;
}
else
{
@ -505,6 +589,22 @@ public:
token_begin = pos;
if (max_splits)
{
if (max_substrings_includes_remaining_string)
{
if (splits == *max_splits - 1)
{
token_end = end;
pos = nullptr;
return true;
}
}
else
if (splits == *max_splits)
return false;
}
if (!re->match(pos, end - pos, matches) || !matches[0].length)
{
token_end = end;
@ -514,6 +614,7 @@ public:
{
token_end = pos + matches[0].offset;
pos = token_end + matches[0].length;
++splits;
}
}
@ -536,14 +637,19 @@ public:
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 2; }
/// Check the type of function arguments.
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
SplitByStringImpl::checkArguments(arguments);
FunctionArgumentDescriptors mandatory_args{
{"haystack", &isString<IDataType>, nullptr, "String"},
{"pattern", &isString<IDataType>, isColumnConst, "const String"}
};
validateFunctionArgumentTypes(func, arguments, mandatory_args);
}
/// Initialize by the function arguments.
void init(const ColumnsWithTypeAndName & arguments)
static constexpr auto strings_argument_position = 0uz;
void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/)
{
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
@ -557,18 +663,6 @@ public:
matches.resize(capture + 1);
}
/// Returns the position of the argument that is the column of strings
static size_t getStringsArgumentPosition()
{
return 0;
}
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
return std::nullopt;
}
/// Called for each next string.
void set(Pos pos_, Pos end_)
{
@ -607,33 +701,30 @@ public:
template <typename Generator>
class FunctionTokens : public IFunction
{
private:
bool max_substrings_includes_remaining_string;
public:
static constexpr auto name = Generator::name;
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionTokens>(); }
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionTokens>(context); }
String getName() const override
explicit FunctionTokens<Generator>(ContextPtr context)
{
return name;
const Settings & settings = context->getSettingsRef();
max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string;
}
String getName() const override { return name; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
bool isVariadic() const override { return Generator::isVariadic(); }
size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
Generator::checkArguments(arguments);
const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition();
if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos]))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"{}-th argument for function '{}' must be integer, got '{}' instead",
*max_substrings_pos + 1,
getName(),
arguments[*max_substrings_pos]->getName());
Generator::checkArguments(*this, arguments);
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
}
@ -641,23 +732,21 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
{
Generator generator;
generator.init(arguments);
const auto & array_argument = arguments[generator.getStringsArgumentPosition()];
generator.init(arguments, max_substrings_includes_remaining_string);
/// Whether we need to limit max tokens returned by Generator::get
/// If max_substrings is std::nullopt, no limit is applied.
auto max_substrings = getMaxSubstrings(arguments);
const auto & array_argument = arguments[generator.strings_argument_position];
const ColumnString * col_str = checkAndGetColumn<ColumnString>(array_argument.column.get());
const ColumnConst * col_const_str =
checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
auto col_res = ColumnArray::create(ColumnString::create());
ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
ColumnArray::Offsets & res_offsets = col_res->getOffsets();
ColumnString::Chars & res_strings_chars = res_strings.getChars();
ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
ColumnArray::Offsets & res_offsets = col_res->getOffsets();
if (col_str)
{
const ColumnString::Chars & src_chars = col_str->getChars();
@ -682,7 +771,7 @@ public:
generator.set(pos, end);
size_t j = 0;
while (generator.get(token_begin, token_end) && !(max_substrings && j >= *max_substrings))
while (generator.get(token_begin, token_end))
{
size_t token_size = token_end - token_begin;
@ -701,65 +790,24 @@ public:
return col_res;
}
else if (col_const_str)
else if (col_str_const)
{
String src = col_const_str->getValue<String>();
String src = col_str_const->getValue<String>();
Array dst;
generator.set(src.data(), src.data() + src.size());
Pos token_begin = nullptr;
Pos token_end = nullptr;
while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings))
while (generator.get(token_begin, token_end))
dst.push_back(String(token_begin, token_end - token_begin));
return result_type->createColumnConst(col_const_str->size(), dst);
return result_type->createColumnConst(col_str_const->size(), dst);
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}",
array_argument.column->getName(), array_argument.column->getName(), getName());
}
private:
template <typename DataType>
std::optional<Int64> getMaxSubstringsImpl(const ColumnWithTypeAndName & argument) const
{
const auto * col = checkAndGetColumnConst<ColumnVector<DataType>>(argument.column.get());
if (!col)
return {};
auto value = col->template getValue<DataType>();
return static_cast<Int64>(value);
}
/// Extracts the optional max_substrings limit from the function arguments.
///
/// Returns std::nullopt ("no limit") when the Generator reports no max_substrings position, when the argument at that
/// position was not passed, or when its value is zero or negative. Otherwise returns the positive limit.
/// Throws ILLEGAL_COLUMN when getMaxSubstringsImpl accepts the argument for none of the eight integer types probed below.
std::optional<size_t> getMaxSubstrings(const ColumnsWithTypeAndName & arguments) const
{
    const auto position = Generator::getMaxSubstringsArgumentPosition();
    if (!position || *position >= arguments.size())
        return std::nullopt;

    const auto & argument = arguments[*position];

    /// Probe the integer types one after another (same order as before); the first one that matches wins.
    std::optional<Int64> limit = getMaxSubstringsImpl<UInt8>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<Int8>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<UInt16>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<Int16>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<UInt32>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<Int32>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<UInt64>(argument);
    if (!limit)
        limit = getMaxSubstringsImpl<Int64>(argument);

    if (!limit)
        throw Exception(
            ErrorCodes::ILLEGAL_COLUMN,
            "Illegal column {}, which is {}-th argument of function {}",
            argument.column->getName(),
            *position + 1,
            getName());

    /// A zero or negative limit means "tokenize as many times as possible", which is the same as
    /// calling the function without a max_substrings argument.
    if (*limit <= 0)
        return std::nullopt;

    return *limit;
}
};
@ -803,7 +851,7 @@ private:
/// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1.
for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset)
{
if (unlikely(null_map && null_map[current_src_array_offset]))
if (null_map && null_map[current_src_array_offset]) [[unlikely]]
continue;
if (!first_non_null)
@ -881,7 +929,7 @@ public:
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override;
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
{

View File

@ -3,10 +3,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
class URLPathHierarchyImpl
{
@ -23,26 +19,18 @@ public:
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors mandatory_args{
{"URL", &isString<IDataType>, nullptr, "String"},
};
validateFunctionArgumentTypes(func, arguments, mandatory_args);
}
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
static constexpr auto strings_argument_position = 0uz;
/// Position of the argument that carries the column of strings to process; here it is always the first argument (0).
static size_t getStringsArgumentPosition() { return 0; }
/// Position of the optional max_substrings argument. An empty result means the function has no such argument,
/// and that is the only value returned here.
static std::optional<size_t> getMaxSubstringsArgumentPosition() { return std::nullopt; }
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Called for each next string.
void set(Pos pos_, Pos end_)

View File

@ -3,10 +3,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
class URLHierarchyImpl
{
@ -22,26 +18,18 @@ public:
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors mandatory_args{
{"URL", &isString<IDataType>, nullptr, "String"},
};
validateFunctionArgumentTypes(func, arguments, mandatory_args);
}
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
static constexpr auto strings_argument_position = 0uz;
/// Position of the argument that carries the column of strings to process; here it is always the first argument (0).
static size_t getStringsArgumentPosition() { return 0; }
/// Position of the optional max_substrings argument. An empty result means the function has no such argument,
/// and that is the only value returned here.
static std::optional<size_t> getMaxSubstringsArgumentPosition() { return std::nullopt; }
/// Nothing to initialise: the body is empty and both parameters are ignored.
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Called for each next string.
void set(Pos pos_, Pos end_)

View File

@ -3,10 +3,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
class ExtractURLParameterNamesImpl
{
@ -22,27 +18,18 @@ public:
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors mandatory_args{
{"URL", &isString<IDataType>, nullptr, "String"},
};
validateFunctionArgumentTypes(func, arguments, mandatory_args);
}
/// Position of the argument that carries the column of strings to process; here it is always the first argument (0).
static size_t getStringsArgumentPosition() { return 0; }
static constexpr auto strings_argument_position = 0uz;
/// Position of the optional max_substrings argument. An empty result means the function has no such argument,
/// and that is the only value returned here.
static std::optional<size_t> getMaxSubstringsArgumentPosition() { return std::nullopt; }
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Called for each next string.
void set(Pos pos_, Pos end_)

View File

@ -3,10 +3,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
class ExtractURLParametersImpl
{
@ -22,26 +18,18 @@ public:
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
static void checkArguments(const DataTypes & arguments)
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. "
"Must be String.", arguments[0]->getName(), getName());
FunctionArgumentDescriptors mandatory_args{
{"URL", &isString<IDataType>, nullptr, "String"},
};
validateFunctionArgumentTypes(func, arguments, mandatory_args);
}
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}
/// Position of the argument that carries the column of strings to process; here it is always the first argument (0).
static size_t getStringsArgumentPosition() { return 0; }
/// Position of the optional max_substrings argument. An empty result means the function has no such argument,
/// and that is the only value returned here.
static std::optional<size_t> getMaxSubstringsArgumentPosition() { return std::nullopt; }
static constexpr auto strings_argument_position = 0uz;
/// Called for each next string.
void set(Pos pos_, Pos end_)

View File

@ -1,44 +1,160 @@
['1','2','3']
['1','2','3']
['1','2','3']
['1']
['1','2']
['1','2','3']
['1','2','3']
['one','two','three','']
['one','two','three','']
['one','two','three','']
['one']
['one','two']
['one','two','three']
['one','two','three','']
['one','two','three','']
['abca','abc']
['abca','abc']
['abca','abc']
['abca']
['abca','abc']
['abca','abc']
['abca','abc']
['1','a','b']
['1','a','b']
['1','a','b']
['1']
['1','a']
['1','a','b']
['1','a','b']
['1!','a,','b.']
['1!','a,','b.']
['1!','a,','b.']
['1!']
['1!','a,']
['1!','a,','b.']
['1!','a,','b.']
['1','2 3','4,5','abcde']
['1','2 3','4,5','abcde']
['1','2 3','4,5','abcde']
['1']
['1','2 3']
['1','2 3','4,5']
['1','2 3','4,5','abcde']
['1','2 3','4,5','abcde']
-- negative tests
-- splitByChar
-- (default)
['a','','b','c','d']
['a','','b','c','d']
['a','','b','c','d']
['a']
['a','']
['a','','b']
['a','','b','c']
['a','','b','c','d']
['a','','b','c','d']
-- (include remainder)
['a','','b','c','d']
['a','','b','c','d']
['a','','b','c','d']
['a==b=c=d']
['a','=b=c=d']
['a','','b=c=d']
['a','','b','c=d']
['a','','b','c','d']
['a','','b','c','d']
-- splitByString
-- (default)
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a']
['a','=']
['a','=','=']
['a','=','=','b']
['a','=','=','b','=']
['a','=','=','b','=','c']
['a','=','=','b','=','c','=']
['a','=','=','b','=','c','=']
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a','','b','c','d']
['a','','b','c','d']
['a','','b','c','d']
['a']
['a','']
['a','','b']
['a','','b','c']
['a','','b','c','d']
['a','','b','c','d']
-- (include remainder)
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a==b=c=d']
['a','==b=c=d']
['a','=','=b=c=d']
['a','=','=','b=c=d']
['a','=','=','b','=c=d']
['a','=','=','b','=','c=d']
['a','=','=','b','=','c','=d']
['a','=','=','b','=','c','=','d']
['a','=','=','b','=','c','=','d']
['a','','b','c','d']
['a','','b','c','d']
['a','','b','c','d']
['a==b=c=d']
['a','=b=c=d']
['a','','b=c=d']
['a','','b','c=d']
['a','','b','c','d']
['a','','b','c','d']
-- splitByRegexp
-- (default)
['a','bc','de','f']
['a','bc','de','f']
['a','bc','de','f']
['a']
['a','bc']
['a','bc','de']
['a','bc','de','f']
['a','bc','de','f']
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a']
['a','1']
['a','1','2']
['a','1','2','b']
['a','1','2','b','c']
-- (include remainder)
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a','1','2','b','c','2','3','d','e','3','4','5','f']
['a12bc23de345f']
['a','12bc23de345f']
['a','1','2bc23de345f']
['a','1','2','bc23de345f']
['a','1','2','b','c23de345f']
['a','bc','de','f']
['a','bc','de','f']
['a','bc','de','f']
['a12bc23de345f']
['a','bc23de345f']
['a','bc','de345f']
['a','bc','de','f']
['a','bc','de','f']
-- splitByAlpha
-- (default)
['ab','cd','ef','gh']
['ab','cd','ef','gh']
['ab','cd','ef','gh']
['ab']
['ab','cd']
['ab','cd','ef']
['ab','cd','ef','gh']
['ab','cd','ef','gh']
-- (include remainder)
['ab','cd','ef','gh']
['ab','cd','ef','gh']
['ab','cd','ef','gh']
['ab.cd.ef.gh']
['ab','cd.ef.gh']
['ab','cd','ef.gh']
['ab','cd','ef','gh']
['ab','cd','ef','gh']
-- splitByNonAlpha
-- (default)
['128','0','0','1']
['128','0','0','1']
['128','0','0','1']
['128']
['128','0']
['128','0','0']
['128','0','0','1']
['128','0','0','1']
-- (include remainder)
['128','0','0','1']
['128','0','0','1']
['128','0','0','1']
['128.0.0.1']
['128','0.0.1']
['128','0','0.1']
['128','0','0','1']
['128','0','0','1']
-- splitByWhitespace
-- (default)
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']
['Nein,']
['Nein,','nein,']
['Nein,','nein,','nein!']
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']
-- (include remainder)
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']
['Nein, nein, nein! Doch!']
['Nein,','nein, nein! Doch!']
['Nein,','nein,','nein! Doch!']
['Nein,','nein,','nein!','Doch!']
['Nein,','nein,','nein!','Doch!']

View File

@ -1,59 +1,175 @@
-- Legacy max_substrings checks. Each group runs one splitting function on a fixed input and sweeps the optional
-- max_substrings argument over: absent, -1, 0, then increasing positive values.
-- The statements must stay in this order: the expected-output file lists one result row per statement.

-- splitByChar: single-character separator ','.
SELECT splitByChar(',', '1,2,3');
SELECT splitByChar(',', '1,2,3', -1);
SELECT splitByChar(',', '1,2,3', 0);
SELECT splitByChar(',', '1,2,3', 1);
SELECT splitByChar(',', '1,2,3', 2);
SELECT splitByChar(',', '1,2,3', 3);
SELECT splitByChar(',', '1,2,3', 4);

-- splitByRegexp: separator is the character class [ABC]; the input ends with a separator.
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC');
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', -1);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 0);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 1);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 2);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 3);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 4);
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', 5);

-- alphaTokens and splitByAlpha on the same input.
SELECT alphaTokens('abca1abc');
SELECT alphaTokens('abca1abc', -1);
SELECT alphaTokens('abca1abc', 0);
SELECT alphaTokens('abca1abc', 1);
SELECT alphaTokens('abca1abc', 2);
SELECT alphaTokens('abca1abc', 3);
SELECT splitByAlpha('abca1abc');

-- splitByNonAlpha: input mixes whitespace and punctuation.
SELECT splitByNonAlpha(' 1! a, b. ');
SELECT splitByNonAlpha(' 1! a, b. ', -1);
SELECT splitByNonAlpha(' 1! a, b. ', 0);
SELECT splitByNonAlpha(' 1! a, b. ', 1);
SELECT splitByNonAlpha(' 1! a, b. ', 2);
SELECT splitByNonAlpha(' 1! a, b. ', 3);
SELECT splitByNonAlpha(' 1! a, b. ', 4);

-- splitByWhitespace: same input as above.
SELECT splitByWhitespace(' 1! a, b. ');
SELECT splitByWhitespace(' 1! a, b. ', -1);
SELECT splitByWhitespace(' 1! a, b. ', 0);
SELECT splitByWhitespace(' 1! a, b. ', 1);
SELECT splitByWhitespace(' 1! a, b. ', 2);
SELECT splitByWhitespace(' 1! a, b. ', 3);
SELECT splitByWhitespace(' 1! a, b. ', 4);

-- splitByString: two-character separator ', ' (a bare ',' or ' ' alone does not split).
SELECT splitByString(', ', '1, 2 3, 4,5, abcde');
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', -1);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 0);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 1);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 2);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 3);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 4);
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', 5);

-- A String passed as max_substrings is annotated to fail with server error 43.
SELECT splitByChar(',', '1,2,3', ''); -- { serverError 43 }
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 }
-- Negative tests: max_substrings is passed as a String (''), and every statement is annotated to expect server error 43.
-- NOTE(review): 43 is presumably ILLEGAL_TYPE_OF_ARGUMENT — confirm against the server's error code list.
SELECT '-- negative tests';
SELECT splitByChar(',', '1,2,3', ''); -- { serverError 43 }
SELECT splitByRegexp('[ABC]', 'oneAtwoBthreeC', ''); -- { serverError 43 }
SELECT alphaTokens('abca1abc', ''); -- { serverError 43 }
SELECT splitByAlpha('abca1abc', ''); -- { serverError 43 }
SELECT splitByNonAlpha(' 1! a, b. ', ''); -- { serverError 43 }
SELECT splitByWhitespace(' 1! a, b. ', ''); -- { serverError 43 }
-- NOTE(review): the splitByString check is issued twice in a row.
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 }
SELECT splitByString(', ', '1, 2 3, 4,5, abcde', ''); -- { serverError 43 }
-- splitByChar: split 'a==b=c=d' on '=' while sweeping max_substrings over absent, -1, 0 and 1..6.
-- The adjacent '=' characters produce an empty element in the expected output.
SELECT '-- splitByChar';
-- No SETTINGS clause here: the queries use the session value of splitby_max_substrings_includes_remaining_string
-- (documented default: 0, i.e. the remainder of the string is not kept).
SELECT '-- (default)';
SELECT splitByChar('=', 'a==b=c=d');
SELECT splitByChar('=', 'a==b=c=d', -1);
SELECT splitByChar('=', 'a==b=c=d', 0);
SELECT splitByChar('=', 'a==b=c=d', 1);
SELECT splitByChar('=', 'a==b=c=d', 2);
SELECT splitByChar('=', 'a==b=c=d', 3);
SELECT splitByChar('=', 'a==b=c=d', 4);
SELECT splitByChar('=', 'a==b=c=d', 5);
SELECT splitByChar('=', 'a==b=c=d', 6);
-- Same sweep with the setting forced to 1: when max_substrings > 0 the last element keeps the unsplit remainder.
SELECT '-- (include remainder)';
SELECT splitByChar('=', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByChar('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
-- splitByString: the input 'a==b=c=d' is split twice per mode, first with an empty separator ''
-- (the expected output has one element per character) and then with the separator '='.
SELECT '-- splitByString';
-- Unlike the splitByChar section, the "default" queries pin the setting to 0 explicitly.
SELECT '-- (default)';
-- Empty separator, max_substrings absent, -1, 0 and 1..9.
SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
-- NOTE(review): max_substrings = 7 is queried twice; the expected-output file contains a matching duplicate row,
-- so removing one statement requires removing one row there as well.
SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 8) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('', 'a==b=c=d', 9) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
-- Separator '=', max_substrings absent, -1, 0 and 1..6.
SELECT splitByString('=', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
SELECT splitByString('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 0;
-- Same two sweeps with the setting at 1: the last element keeps the unsplit remainder when max_substrings > 0.
SELECT '-- (include remainder)';
-- Empty separator, max_substrings absent, -1, 0 and 1..9 (no duplicated value in this sweep).
SELECT splitByString('', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 7) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 8) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('', 'a==b=c=d', 9) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
-- Separator '=', max_substrings absent, -1, 0 and 1..6.
SELECT splitByString('=', 'a==b=c=d') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByString('=', 'a==b=c=d', 6) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
-- splitByRegexp: split 'a12bc23de345f' with the pattern '\\d+' (runs of digits) and with an empty pattern '',
-- sweeping max_substrings over absent, -1, 0 and 1..5.
-- NOTE: the two modes use opposite pattern order ('\\d+' first in the default run, '' first in the
-- include-remainder run); the expected-output file follows the same order.
SELECT '-- splitByRegexp';
-- No SETTINGS clause: uses the session value of splitby_max_substrings_includes_remaining_string.
SELECT '-- (default)';
SELECT splitByRegexp('\\d+', 'a12bc23de345f');
SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 3);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 4);
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 5);
SELECT splitByRegexp('', 'a12bc23de345f');
SELECT splitByRegexp('', 'a12bc23de345f', -1);
SELECT splitByRegexp('', 'a12bc23de345f', 0);
SELECT splitByRegexp('', 'a12bc23de345f', 1);
SELECT splitByRegexp('', 'a12bc23de345f', 2);
SELECT splitByRegexp('', 'a12bc23de345f', 3);
SELECT splitByRegexp('', 'a12bc23de345f', 4);
SELECT splitByRegexp('', 'a12bc23de345f', 5);
-- Setting forced to 1: the last element keeps the unsplit remainder when max_substrings > 0.
SELECT '-- (include remainder)';
SELECT splitByRegexp('', 'a12bc23de345f') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('', 'a12bc23de345f', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
-- splitByAlpha: run on 'ab.cd.ef.gh' (expected unlimited result: 'ab','cd','ef','gh'),
-- sweeping max_substrings over absent, -1, 0 and 1..5.
SELECT '-- splitByAlpha';
-- No SETTINGS clause: uses the session value of splitby_max_substrings_includes_remaining_string.
SELECT '-- (default)';
SELECT splitByAlpha('ab.cd.ef.gh');
SELECT splitByAlpha('ab.cd.ef.gh', -1);
SELECT splitByAlpha('ab.cd.ef.gh', 0);
SELECT splitByAlpha('ab.cd.ef.gh', 1);
SELECT splitByAlpha('ab.cd.ef.gh', 2);
SELECT splitByAlpha('ab.cd.ef.gh', 3);
SELECT splitByAlpha('ab.cd.ef.gh', 4);
SELECT splitByAlpha('ab.cd.ef.gh', 5);
-- Setting forced to 1: the last element keeps the unsplit remainder when max_substrings > 0.
SELECT '-- (include remainder)';
SELECT splitByAlpha('ab.cd.ef.gh') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByAlpha('ab.cd.ef.gh', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
-- splitByNonAlpha: run on '128.0.0.1' (expected unlimited result: '128','0','0','1'),
-- sweeping max_substrings over absent, -1, 0 and 1..5.
SELECT '-- splitByNonAlpha';
-- No SETTINGS clause: uses the session value of splitby_max_substrings_includes_remaining_string.
SELECT '-- (default)';
SELECT splitByNonAlpha('128.0.0.1');
SELECT splitByNonAlpha('128.0.0.1', -1);
SELECT splitByNonAlpha('128.0.0.1', 0);
SELECT splitByNonAlpha('128.0.0.1', 1);
SELECT splitByNonAlpha('128.0.0.1', 2);
SELECT splitByNonAlpha('128.0.0.1', 3);
SELECT splitByNonAlpha('128.0.0.1', 4);
SELECT splitByNonAlpha('128.0.0.1', 5);
-- Setting forced to 1: the last element keeps the unsplit remainder when max_substrings > 0.
SELECT '-- (include remainder)';
SELECT splitByNonAlpha('128.0.0.1') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByNonAlpha('128.0.0.1', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
--
--
-- splitByWhitespace: run on 'Nein, nein, nein! Doch!' (expected unlimited result keeps the punctuation attached
-- to each word), sweeping max_substrings over absent, -1, 0 and 1..5.
SELECT '-- splitByWhitespace';
-- No SETTINGS clause: uses the session value of splitby_max_substrings_includes_remaining_string.
SELECT '-- (default)';
SELECT splitByWhitespace('Nein, nein, nein! Doch!');
SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 3);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 4);
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 5);
-- Setting forced to 1: the last element keeps the unsplit remainder when max_substrings > 0.
SELECT '-- (include remainder)';
SELECT splitByWhitespace('Nein, nein, nein! Doch!') SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', -1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 0) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 1) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 3) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 4) SETTINGS splitby_max_substrings_includes_remaining_string = 1;
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 5) SETTINGS splitby_max_substrings_includes_remaining_string = 1;

View File

@ -2215,6 +2215,8 @@ sparkBar
sparkbar
sparsehash
speedscope
splitBy
splitby
splitByChar
splitByNonAlpha
splitByRegexp