mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Pythonic vs Spark splitting
This commit is contained in:
parent
68cdfbcc6c
commit
b5b2cc511b
@ -4067,6 +4067,17 @@ Result:
|
|||||||
└─────┴─────┴───────┘
|
└─────┴─────┴───────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## split_tokens_like_python {#split-tokens-like-python}
|
||||||
|
|
||||||
|
Controls whether functions [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) called with argument `max_substrings` > 0 include the remaining string (if any) in the result array (Python semantics) or not (Spark semantics).
|
||||||
|
|
||||||
|
Possible values:
|
||||||
|
|
||||||
|
- 0 - Don't include the remaining string (Spark semantics).
|
||||||
|
- 1 - Include the remaining string (Python semantics).
|
||||||
|
|
||||||
|
Default value: `0`.
|
||||||
|
|
||||||
## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions}
|
## enable_extended_results_for_datetime_functions {#enable-extended-results-for-datetime-functions}
|
||||||
|
|
||||||
Enables or disables returning results of type:
|
Enables or disables returning results of type:
|
||||||
|
@ -38,6 +38,8 @@ The behavior of parameter `max_substrings` changed starting with ClickHouse v22.
|
|||||||
For example,
|
For example,
|
||||||
- in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']`
|
- in v22.10: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b','c=d']`
|
||||||
- in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']`
|
- in v22.11: `SELECT splitByChar('=', 'a=b=c=d', 2); -- ['a','b']`
|
||||||
|
|
||||||
|
The previous behavior can be restored by setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) = 1.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
@ -80,6 +82,8 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere
|
|||||||
- There are multiple consecutive non-empty separators;
|
- There are multiple consecutive non-empty separators;
|
||||||
- The original string `s` is empty while the separator is not empty.
|
- The original string `s` is empty while the separator is not empty.
|
||||||
|
|
||||||
|
Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether the remaining string (if any) is included in the result array when `max_substrings` > 0.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
@ -133,6 +137,8 @@ Returns an array of selected substrings. Empty substrings may be selected when:
|
|||||||
|
|
||||||
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
||||||
|
|
||||||
|
Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether the remaining string (if any) is included in the result array when `max_substrings` > 0.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
@ -182,6 +188,8 @@ Returns an array of selected substrings.
|
|||||||
|
|
||||||
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
||||||
|
|
||||||
|
Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether the remaining string (if any) is included in the result array when `max_substrings` > 0.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
@ -219,6 +227,8 @@ Returns an array of selected substrings.
|
|||||||
|
|
||||||
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
||||||
|
|
||||||
|
Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether the remaining string (if any) is included in the result array when `max_substrings` > 0.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
@ -279,6 +289,8 @@ Returns an array of selected substrings.
|
|||||||
|
|
||||||
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)).
|
||||||
|
|
||||||
|
Setting [split_tokens_like_python](../../operations/settings/settings.md#split-tokens-like-python) (default: 0) controls whether the remaining string (if any) is included in the result array when `max_substrings` > 0.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
|
@ -502,6 +502,7 @@ class IColumn;
|
|||||||
M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \
|
M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \
|
||||||
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
|
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
|
||||||
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
|
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
|
||||||
|
M(Bool, split_tokens_like_python, false, "If true, then functions splitBy*() with given max_substring argument include remaining string in the result (Python semantics) or not (Spark semantics).", 0) \
|
||||||
\
|
\
|
||||||
M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \
|
M(Bool, allow_execute_multiif_columnar, true, "Allow execute multiIf function columnar", 0) \
|
||||||
M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \
|
M(Bool, formatdatetime_f_prints_single_zero, false, "Formatter '%f' in function 'formatDateTime()' produces a single zero instead of six zeros if the formatted value has no fractional seconds.", 0) \
|
||||||
|
@ -9,6 +9,40 @@ namespace ErrorCodes
|
|||||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tries to read `argument` as a constant column of ColumnVector<DataType> and returns its value cast to Int64.
/// Returns std::nullopt when checkAndGetColumnConst gives back a null pointer for this DataType
/// (presumably: the column is not a constant column of that numeric type - TODO confirm against checkAndGetColumnConst).
/// NOTE(review): for DataType = UInt64 the static_cast to Int64 looks like it can turn very large values negative - confirm.
template <typename DataType>
|
||||||
|
std::optional<Int64> extractMaxSplitsImpl(const ColumnWithTypeAndName & argument)
|
||||||
|
{
|
||||||
|
const auto * col = checkAndGetColumnConst<ColumnVector<DataType>>(argument.column.get());
|
||||||
|
if (!col)
|
||||||
|
return std::nullopt;
|
||||||
|
|
||||||
|
auto value = col->template getValue<DataType>();
|
||||||
|
return static_cast<Int64>(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extracts the split limit from the argument at index `max_substrings_argument_position`.
///
/// Returns std::nullopt (meaning: no limit) when
/// - the argument is absent, i.e. the position is not smaller than arguments.size(), or
/// - the extracted value is zero or negative.
/// Otherwise returns the extracted value.
///
/// The argument is probed as a constant column of each of the types UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64
/// (in this order); the first probe that yields a value wins. Throws Exception with ErrorCodes::ILLEGAL_COLUMN if no probe succeeds.
///
/// NOTE(review): the value is held as std::optional<Int64> but returned as std::optional<size_t>; only positive values reach
/// the final return, so the conversion looks safe - confirm.
/// NOTE(review): the function name part of the error message is commented out below (this is a free function, no getName() in scope).
std::optional<size_t> extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position)
|
||||||
|
{
|
||||||
|
if (max_substrings_argument_position >= arguments.size())
|
||||||
|
return std::nullopt;
|
||||||
|
|
||||||
|
std::optional<Int64> max_splits;
|
||||||
|
if (!((max_splits = extractMaxSplitsImpl<UInt8>(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl<Int8>(arguments[max_substrings_argument_position]))
|
||||||
|
|| (max_splits = extractMaxSplitsImpl<UInt16>(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl<Int16>(arguments[max_substrings_argument_position]))
|
||||||
|
|| (max_splits = extractMaxSplitsImpl<UInt32>(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl<Int32>(arguments[max_substrings_argument_position]))
|
||||||
|
|| (max_splits = extractMaxSplitsImpl<UInt64>(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl<Int64>(arguments[max_substrings_argument_position]))))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::ILLEGAL_COLUMN,
|
||||||
|
"Illegal column {}, which is {}-th argument",// of function {}",
|
||||||
|
arguments[max_substrings_argument_position].column->getName(),
|
||||||
|
max_substrings_argument_position + 1);//,
|
||||||
|
/// getName());
|
||||||
|
|
||||||
|
/// A zero or negative limit is treated like a missing limit.
if (max_splits && *max_splits <= 0)
|
||||||
|
return std::nullopt;
|
||||||
|
|
||||||
|
return max_splits;
|
||||||
|
}
|
||||||
|
|
||||||
DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const
|
DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const
|
||||||
{
|
{
|
||||||
FunctionArgumentDescriptors mandatory_args{
|
FunctionArgumentDescriptors mandatory_args{
|
||||||
|
@ -56,6 +56,13 @@ namespace ErrorCodes
|
|||||||
|
|
||||||
using Pos = const char *;
|
using Pos = const char *;
|
||||||
|
|
||||||
|
std::optional<size_t> extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position);
|
||||||
|
|
||||||
|
/// Selects what the substring generators do once the number of performed splits has reached max_splits:
/// - LikeSpark: stop, i.e. get() returns false and the rest of the string is dropped.
/// - LikePython: get() returns the rest of the string (up to its end) as one final token.
/// FunctionTokens chooses the mode from setting split_tokens_like_python (true -> LikePython, false -> LikeSpark).
enum class SplitTokenMode
|
||||||
|
{
|
||||||
|
LikeSpark,
|
||||||
|
LikePython
|
||||||
|
};
|
||||||
|
|
||||||
/// Substring generators. All of them have a common interface.
|
/// Substring generators. All of them have a common interface.
|
||||||
|
|
||||||
@ -64,6 +71,9 @@ class SplitByAlphaImpl
|
|||||||
private:
|
private:
|
||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = "alphaTokens";
|
static constexpr auto name = "alphaTokens";
|
||||||
@ -87,15 +97,19 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(1);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
|
{
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 1);
|
||||||
|
}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token, if any, or return false.
|
/// Get the next token, if any, or return false.
|
||||||
@ -110,10 +124,26 @@ public:
|
|||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
while (pos < end && isAlphaASCII(*pos))
|
while (pos < end && isAlphaASCII(*pos))
|
||||||
++pos;
|
++pos;
|
||||||
|
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
|
++splits;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -124,6 +154,9 @@ class SplitByNonAlphaImpl
|
|||||||
private:
|
private:
|
||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/// Get the name of the function.
|
/// Get the name of the function.
|
||||||
@ -139,15 +172,19 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(1);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
|
{
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 1);
|
||||||
|
}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token, if any, or return false.
|
/// Get the next token, if any, or return false.
|
||||||
@ -162,10 +199,25 @@ public:
|
|||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
|
while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
|
||||||
++pos;
|
++pos;
|
||||||
|
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
|
splits++;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -176,6 +228,9 @@ class SplitByWhitespaceImpl
|
|||||||
private:
|
private:
|
||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = "splitByWhitespace";
|
static constexpr auto name = "splitByWhitespace";
|
||||||
@ -190,15 +245,19 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(1);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
|
{
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 1);
|
||||||
|
}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token, if any, or return false.
|
/// Get the next token, if any, or return false.
|
||||||
@ -213,10 +272,26 @@ public:
|
|||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
while (pos < end && !isWhitespaceASCII(*pos))
|
while (pos < end && !isWhitespaceASCII(*pos))
|
||||||
++pos;
|
++pos;
|
||||||
|
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
|
splits++;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -228,6 +303,9 @@ private:
|
|||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
char separator;
|
char separator;
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = "splitByChar";
|
static constexpr auto name = "splitByChar";
|
||||||
@ -250,9 +328,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 1uz;
|
static constexpr auto strings_argument_position = 1uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(2);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & arguments)
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
{
|
{
|
||||||
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
||||||
|
|
||||||
@ -266,12 +343,16 @@ public:
|
|||||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", getName());
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", getName());
|
||||||
|
|
||||||
separator = sep_str[0];
|
separator = sep_str[0];
|
||||||
|
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool get(Pos & token_begin, Pos & token_end)
|
bool get(Pos & token_begin, Pos & token_end)
|
||||||
@ -280,12 +361,28 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
pos = reinterpret_cast<Pos>(memchr(pos, separator, end - pos));
|
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = nullptr;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = reinterpret_cast<Pos>(memchr(pos, separator, end - pos));
|
||||||
if (pos)
|
if (pos)
|
||||||
{
|
{
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
++pos;
|
++pos;
|
||||||
|
++splits;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
token_end = end;
|
token_end = end;
|
||||||
@ -300,8 +397,10 @@ class SplitByStringImpl
|
|||||||
private:
|
private:
|
||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
|
|
||||||
String separator;
|
String separator;
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = "splitByString";
|
static constexpr auto name = "splitByString";
|
||||||
@ -315,9 +414,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 1uz;
|
static constexpr auto strings_argument_position = 1uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(2);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & arguments)
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
{
|
{
|
||||||
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
||||||
|
|
||||||
@ -326,6 +424,9 @@ public:
|
|||||||
"Must be constant string.", arguments[0].column->getName(), getName());
|
"Must be constant string.", arguments[0].column->getName(), getName());
|
||||||
|
|
||||||
separator = col->getValue<String>();
|
separator = col->getValue<String>();
|
||||||
|
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
@ -333,6 +434,7 @@ public:
|
|||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token, if any, or return false.
|
/// Get the next token, if any, or return false.
|
||||||
@ -344,8 +446,25 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pos += 1;
|
pos += 1;
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
|
++splits;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -354,8 +473,22 @@ public:
|
|||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
pos = reinterpret_cast<Pos>(memmem(pos, end - pos, separator.data(), separator.size()));
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = nullptr;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pos = reinterpret_cast<Pos>(memmem(pos, end - pos, separator.data(), separator.size()));
|
||||||
if (pos)
|
if (pos)
|
||||||
{
|
{
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
@ -363,6 +496,7 @@ public:
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
token_end = end;
|
token_end = end;
|
||||||
|
++splits;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -378,6 +512,10 @@ private:
|
|||||||
Pos pos;
|
Pos pos;
|
||||||
Pos end;
|
Pos end;
|
||||||
|
|
||||||
|
std::optional<size_t> max_splits;
|
||||||
|
size_t splits;
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = "splitByRegexp";
|
static constexpr auto name = "splitByRegexp";
|
||||||
static String getName() { return name; }
|
static String getName() { return name; }
|
||||||
@ -391,9 +529,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 1uz;
|
static constexpr auto strings_argument_position = 1uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional(2);
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & arguments)
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode split_token_mode_)
|
||||||
{
|
{
|
||||||
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
||||||
|
|
||||||
@ -403,6 +540,9 @@ public:
|
|||||||
|
|
||||||
if (!col->getValue<String>().empty())
|
if (!col->getValue<String>().empty())
|
||||||
re = std::make_shared<OptimizedRegularExpression>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
re = std::make_shared<OptimizedRegularExpression>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
||||||
|
|
||||||
|
split_token_mode = split_token_mode_;
|
||||||
|
max_splits = extractMaxSplits(arguments, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
@ -410,6 +550,7 @@ public:
|
|||||||
{
|
{
|
||||||
pos = pos_;
|
pos = pos_;
|
||||||
end = end_;
|
end = end_;
|
||||||
|
splits = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token, if any, or return false.
|
/// Get the next token, if any, or return false.
|
||||||
@ -421,8 +562,25 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = end;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pos += 1;
|
pos += 1;
|
||||||
token_end = pos;
|
token_end = pos;
|
||||||
|
++splits;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -431,6 +589,21 @@ public:
|
|||||||
|
|
||||||
token_begin = pos;
|
token_begin = pos;
|
||||||
|
|
||||||
|
if (max_splits && splits >= max_splits)
|
||||||
|
{
|
||||||
|
switch (split_token_mode)
|
||||||
|
{
|
||||||
|
case SplitTokenMode::LikeSpark:
|
||||||
|
return false;
|
||||||
|
case SplitTokenMode::LikePython:
|
||||||
|
{
|
||||||
|
token_end = end;
|
||||||
|
pos = nullptr;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!re->match(pos, end - pos, matches) || !matches[0].length)
|
if (!re->match(pos, end - pos, matches) || !matches[0].length)
|
||||||
{
|
{
|
||||||
token_end = end;
|
token_end = end;
|
||||||
@ -441,6 +614,7 @@ public:
|
|||||||
token_end = pos + matches[0].offset;
|
token_end = pos + matches[0].offset;
|
||||||
pos = token_end + matches[0].length;
|
pos = token_end + matches[0].length;
|
||||||
}
|
}
|
||||||
|
++splits;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -473,9 +647,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional<size_t>();
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & arguments)
|
void init(const ColumnsWithTypeAndName & arguments, SplitTokenMode /*split_token_mode*/)
|
||||||
{
|
{
|
||||||
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
||||||
|
|
||||||
@ -527,9 +700,18 @@ public:
|
|||||||
template <typename Generator>
|
template <typename Generator>
|
||||||
class FunctionTokens : public IFunction
|
class FunctionTokens : public IFunction
|
||||||
{
|
{
|
||||||
|
private:
|
||||||
|
SplitTokenMode split_token_mode;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static constexpr auto name = Generator::name;
|
static constexpr auto name = Generator::name;
|
||||||
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionTokens>(); }
|
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionTokens>(context); }
|
||||||
|
|
||||||
|
explicit FunctionTokens<Generator>(ContextPtr context)
|
||||||
|
{
|
||||||
|
const Settings & settings = context->getSettingsRef();
|
||||||
|
split_token_mode = settings.split_tokens_like_python ? SplitTokenMode::LikePython : SplitTokenMode::LikeSpark;
|
||||||
|
}
|
||||||
|
|
||||||
String getName() const override { return name; }
|
String getName() const override { return name; }
|
||||||
|
|
||||||
@ -549,14 +731,10 @@ public:
|
|||||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
|
||||||
{
|
{
|
||||||
Generator generator;
|
Generator generator;
|
||||||
generator.init(arguments);
|
generator.init(arguments, split_token_mode);
|
||||||
|
|
||||||
const auto & array_argument = arguments[generator.strings_argument_position];
|
const auto & array_argument = arguments[generator.strings_argument_position];
|
||||||
|
|
||||||
/// Whether we need to limit max tokens returned by Generator::get
|
|
||||||
/// If max_substrings is std::nullopt, no limit is applied.
|
|
||||||
auto max_substrings = getMaxSubstrings(arguments);
|
|
||||||
|
|
||||||
const ColumnString * col_str = checkAndGetColumn<ColumnString>(array_argument.column.get());
|
const ColumnString * col_str = checkAndGetColumn<ColumnString>(array_argument.column.get());
|
||||||
const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
|
const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
|
||||||
|
|
||||||
@ -592,7 +770,7 @@ public:
|
|||||||
|
|
||||||
generator.set(pos, end);
|
generator.set(pos, end);
|
||||||
size_t j = 0;
|
size_t j = 0;
|
||||||
while (generator.get(token_begin, token_end) && !(max_substrings && j >= *max_substrings))
|
while (generator.get(token_begin, token_end))
|
||||||
{
|
{
|
||||||
size_t token_size = token_end - token_begin;
|
size_t token_size = token_end - token_begin;
|
||||||
|
|
||||||
@ -620,7 +798,7 @@ public:
|
|||||||
Pos token_begin = nullptr;
|
Pos token_begin = nullptr;
|
||||||
Pos token_end = nullptr;
|
Pos token_end = nullptr;
|
||||||
|
|
||||||
while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings))
|
while (generator.get(token_begin, token_end))
|
||||||
dst.push_back(String(token_begin, token_end - token_begin));
|
dst.push_back(String(token_begin, token_end - token_begin));
|
||||||
|
|
||||||
return result_type->createColumnConst(col_str_const->size(), dst);
|
return result_type->createColumnConst(col_str_const->size(), dst);
|
||||||
@ -629,47 +807,6 @@ public:
|
|||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}",
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}",
|
||||||
array_argument.column->getName(), array_argument.column->getName(), getName());
|
array_argument.column->getName(), array_argument.column->getName(), getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
template <typename DataType>
|
|
||||||
std::optional<Int64> getMaxSubstringsImpl(const ColumnWithTypeAndName & argument) const
|
|
||||||
{
|
|
||||||
const auto * col = checkAndGetColumnConst<ColumnVector<DataType>>(argument.column.get());
|
|
||||||
if (!col)
|
|
||||||
return {};
|
|
||||||
|
|
||||||
auto value = col->template getValue<DataType>();
|
|
||||||
return static_cast<Int64>(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::optional<size_t> getMaxSubstrings(const ColumnsWithTypeAndName & arguments) const
|
|
||||||
{
|
|
||||||
const auto pos = Generator::max_substrings_argument_position;
|
|
||||||
if (!pos)
|
|
||||||
return std::nullopt;
|
|
||||||
|
|
||||||
if (*pos >= arguments.size())
|
|
||||||
return std::nullopt;
|
|
||||||
|
|
||||||
std::optional<Int64> max_substrings;
|
|
||||||
if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[*pos]))
|
|
||||||
|| (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[*pos]))
|
|
||||||
|| (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[*pos]))
|
|
||||||
|| (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[*pos]))))
|
|
||||||
throw Exception(
|
|
||||||
ErrorCodes::ILLEGAL_COLUMN,
|
|
||||||
"Illegal column {}, which is {}-th argument of function {}",
|
|
||||||
arguments[*pos].column->getName(),
|
|
||||||
*pos + 1,
|
|
||||||
getName());
|
|
||||||
|
|
||||||
/// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to
|
|
||||||
/// no max_substrings argument in function
|
|
||||||
if (max_substrings && *max_substrings <= 0)
|
|
||||||
return std::nullopt;
|
|
||||||
|
|
||||||
return max_substrings;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,9 +33,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional<size_t>();
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
|
@ -32,9 +32,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional<size_t>();
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
|
@ -32,9 +32,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional<size_t>();
|
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {}
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
|
@ -31,10 +31,9 @@ public:
|
|||||||
validateFunctionArgumentTypes(func, arguments, mandatory_args);
|
validateFunctionArgumentTypes(func, arguments, mandatory_args);
|
||||||
}
|
}
|
||||||
|
|
||||||
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
void init(const ColumnsWithTypeAndName & /*arguments*/, SplitTokenMode /*split_token_mode*/) {}
|
||||||
|
|
||||||
static constexpr auto strings_argument_position = 0uz;
|
static constexpr auto strings_argument_position = 0uz;
|
||||||
static constexpr auto max_substrings_argument_position = std::make_optional<size_t>();
|
|
||||||
|
|
||||||
/// Called for each next string.
|
/// Called for each next string.
|
||||||
void set(Pos pos_, Pos end_)
|
void set(Pos pos_, Pos end_)
|
||||||
|
@ -0,0 +1,22 @@
|
|||||||
|
splitByAlpha
|
||||||
|
['ab','cd']
|
||||||
|
['ab','cd','ef.gh']
|
||||||
|
splitByNonAlpha
|
||||||
|
['128','0']
|
||||||
|
['128','0','0.1']
|
||||||
|
splitByWhitespace
|
||||||
|
['Nein,','nein,']
|
||||||
|
['Nein,','nein,','nein! Doch!']
|
||||||
|
splitByChar
|
||||||
|
['a','b']
|
||||||
|
['a','b','c=d']
|
||||||
|
splitByString
|
||||||
|
['a','=']
|
||||||
|
['a','=','=b==c==d']
|
||||||
|
['a','b']
|
||||||
|
['a','b','c==d']
|
||||||
|
splitByRegexp
|
||||||
|
['a','1']
|
||||||
|
['a','1','2bc23de345f']
|
||||||
|
['a','bc']
|
||||||
|
['a','bc','de345f']
|
27
tests/queries/0_stateless/02876_splitby_spark_vs_python.sql
Normal file
27
tests/queries/0_stateless/02876_splitby_spark_vs_python.sql
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
SELECT 'splitByAlpha';
|
||||||
|
SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 0;
|
||||||
|
SELECT splitByAlpha('ab.cd.ef.gh', 2) settings split_tokens_like_python = 1;
|
||||||
|
|
||||||
|
SELECT 'splitByNonAlpha';
|
||||||
|
SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 0;
|
||||||
|
SELECT splitByNonAlpha('128.0.0.1', 2) settings split_tokens_like_python = 1;
|
||||||
|
|
||||||
|
SELECT 'splitByWhitespace';
|
||||||
|
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) settings split_tokens_like_python = 0;
|
||||||
|
SELECT splitByWhitespace('Nein, nein, nein! Doch!', 2) settings split_tokens_like_python = 1;
|
||||||
|
|
||||||
|
SELECT 'splitByChar';
|
||||||
|
SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 0;
|
||||||
|
SELECT splitByChar('=', 'a=b=c=d', 2) SETTINGS split_tokens_like_python = 1;
|
||||||
|
|
||||||
|
SELECT 'splitByString';
|
||||||
|
SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0;
|
||||||
|
SELECT splitByString('', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1;
|
||||||
|
SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 0;
|
||||||
|
SELECT splitByString('==', 'a==b==c==d', 2) SETTINGS split_tokens_like_python = 1;
|
||||||
|
|
||||||
|
SELECT 'splitByRegexp';
|
||||||
|
SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0;
|
||||||
|
SELECT splitByRegexp('', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1;
|
||||||
|
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 0;
|
||||||
|
SELECT splitByRegexp('\\d+', 'a12bc23de345f', 2) SETTINGS split_tokens_like_python = 1;
|
Loading…
Reference in New Issue
Block a user