2012-09-17 04:34:19 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-10-12 21:23:40 +00:00
|
|
|
#include <Columns/ColumnArray.h>
|
|
|
|
#include <Columns/ColumnConst.h>
|
|
|
|
#include <Columns/ColumnFixedString.h>
|
2021-10-19 17:54:27 +00:00
|
|
|
#include <Columns/ColumnNullable.h>
|
2021-10-12 21:23:40 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <DataTypes/DataTypeArray.h>
|
2021-10-12 21:23:40 +00:00
|
|
|
#include <DataTypes/DataTypeNullable.h>
|
2019-02-10 17:40:52 +00:00
|
|
|
#include <DataTypes/DataTypeString.h>
|
2021-10-12 21:23:40 +00:00
|
|
|
#include <Functions/FunctionHelpers.h>
|
2021-10-22 20:19:32 +00:00
|
|
|
#include <Functions/FunctionsConversion.h>
|
2021-05-17 07:30:42 +00:00
|
|
|
#include <Functions/IFunction.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Functions/Regexps.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
2021-10-12 21:23:40 +00:00
|
|
|
#include <Interpreters/Context_fwd.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
|
|
#include <Common/assert_cast.h>
|
|
|
|
#include <Common/typeid_cast.h>
|
2012-09-17 04:34:19 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2017-06-13 02:06:53 +00:00
|
|
|
/// Error codes referenced by the generators in this file; only declared here.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
|
|
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/** Functions that split strings into an array of strings or vice versa.
|
2012-09-17 04:34:19 +00:00
|
|
|
*
|
2022-11-03 08:12:19 +00:00
|
|
|
* splitByChar(sep, s[, max_substrings])
|
|
|
|
* splitByString(sep, s[, max_substrings])
|
|
|
|
* splitByRegexp(regexp, s[, max_substrings])
|
2021-06-19 18:52:09 +00:00
|
|
|
*
|
2022-11-03 08:12:19 +00:00
|
|
|
* splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters
|
|
|
|
* splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters
|
2012-09-23 05:28:59 +00:00
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
* extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
|
|
|
|
* - first subpattern, if regexp has subpattern;
|
|
|
|
* - zero subpattern (the matched part), otherwise;
|
|
|
|
* - otherwise, an empty array
|
2013-08-02 16:31:23 +00:00
|
|
|
*
|
2015-10-19 21:07:24 +00:00
|
|
|
* arrayStringConcat(arr)
|
|
|
|
* arrayStringConcat(arr, delimiter)
|
2017-05-27 15:45:25 +00:00
|
|
|
* - join an array of strings into one string via a separator.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2022-11-03 08:12:19 +00:00
|
|
|
* alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`.
|
2014-06-26 00:58:14 +00:00
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
* URL functions are located separately.
|
2012-09-17 04:34:19 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2016-05-28 10:35:44 +00:00
|
|
|
/// Read-only pointer into the bytes of a string; the generators below walk a [pos, end) range of such pointers.
using Pos = const char *;
|
2012-09-17 04:34:19 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Substring generators. All of them have a common interface.
|
2012-09-23 05:28:59 +00:00
|
|
|
|
2022-11-03 07:06:58 +00:00
|
|
|
class SplitByAlphaImpl
|
2012-09-17 04:34:19 +00:00
|
|
|
{
|
2012-09-23 05:28:59 +00:00
|
|
|
private:
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
|
|
|
|
public:
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the name of the function.
|
2014-11-12 17:23:26 +00:00
|
|
|
static constexpr auto name = "alphaTokens";
|
|
|
|
static String getName() { return name; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
static bool isVariadic() { return true; }
|
2022-01-30 01:10:27 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Check the type of the function's arguments.
|
2012-09-23 05:28:59 +00:00
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2012-09-23 05:28:59 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Initialize by the function arguments.
|
2020-11-17 13:24:45 +00:00
|
|
|
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2012-09-23 05:28:59 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument, that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2013-08-07 11:25:02 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
2022-11-03 07:05:23 +00:00
|
|
|
return 1;
|
2022-10-25 08:38:36 +00:00
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2012-09-23 05:28:59 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
2016-08-16 21:23:53 +00:00
|
|
|
/// Skip garbage
|
|
|
|
while (pos < end && !isAlphaASCII(*pos))
|
2012-09-23 05:28:59 +00:00
|
|
|
++pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
if (pos == end)
|
|
|
|
return false;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
token_begin = pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-08-16 21:23:53 +00:00
|
|
|
while (pos < end && isAlphaASCII(*pos))
|
2012-09-23 05:28:59 +00:00
|
|
|
++pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
token_end = pos;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
2012-09-17 04:34:19 +00:00
|
|
|
|
2021-06-19 12:33:36 +00:00
|
|
|
class SplitByNonAlphaImpl
|
|
|
|
{
|
|
|
|
private:
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
|
|
|
|
public:
|
|
|
|
/// Get the name of the function.
|
|
|
|
static constexpr auto name = "splitByNonAlpha";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
static bool isVariadic() { return true; }
|
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2021-06-19 12:33:36 +00:00
|
|
|
|
|
|
|
/// Check the type of the function's arguments.
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
|
|
|
if (!isString(arguments[0]))
|
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Initialize by the function arguments.
|
|
|
|
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
|
|
|
|
|
|
|
/// Called for each next string.
|
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the position of the argument, that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2021-06-19 12:33:36 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
2022-11-03 07:05:23 +00:00
|
|
|
return 1;
|
2022-10-25 08:38:36 +00:00
|
|
|
}
|
|
|
|
|
2021-06-19 12:33:36 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
|
|
|
/// Skip garbage
|
|
|
|
while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
|
|
|
|
|
|
|
while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class SplitByWhitespaceImpl
|
|
|
|
{
|
|
|
|
private:
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
|
|
|
|
public:
|
|
|
|
/// Get the name of the function.
|
|
|
|
static constexpr auto name = "splitByWhitespace";
|
|
|
|
static String getName() { return name; }
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
static bool isVariadic() { return true; }
|
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2021-06-19 12:33:36 +00:00
|
|
|
|
|
|
|
/// Check the type of the function's arguments.
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
|
|
|
if (!isString(arguments[0]))
|
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Initialize by the function arguments.
|
|
|
|
void init(const ColumnsWithTypeAndName & /*arguments*/) {}
|
|
|
|
|
|
|
|
/// Called for each next string.
|
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the position of the argument, that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2021-06-19 12:33:36 +00:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
2022-11-03 07:05:23 +00:00
|
|
|
return 1;
|
2022-10-25 08:38:36 +00:00
|
|
|
}
|
|
|
|
|
2021-06-19 12:33:36 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
|
|
|
/// Skip garbage
|
|
|
|
while (pos < end && isWhitespaceASCII(*pos))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
|
|
|
|
|
|
|
while (pos < end && !isWhitespaceASCII(*pos))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
token_end = pos;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
2012-09-17 04:34:19 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
class SplitByCharImpl
|
2012-09-17 04:34:19 +00:00
|
|
|
{
|
2012-09-23 05:28:59 +00:00
|
|
|
private:
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
char sep;
|
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
public:
|
2014-11-12 17:23:26 +00:00
|
|
|
static constexpr auto name = "splitByChar";
|
|
|
|
static String getName() { return name; }
|
2022-01-30 01:10:27 +00:00
|
|
|
static bool isVariadic() { return true; }
|
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
static void checkArguments(const DataTypes & arguments)
|
2012-09-17 04:34:19 +00:00
|
|
|
{
|
2022-01-30 01:10:27 +00:00
|
|
|
if (arguments.size() < 2 || arguments.size() > 3)
|
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
|
|
|
"Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.",
|
2022-03-28 09:48:17 +00:00
|
|
|
name, arguments.size());
|
2022-01-30 01:10:27 +00:00
|
|
|
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[0]))
|
2012-09-17 04:34:19 +00:00
|
|
|
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-09-07 14:37:26 +00:00
|
|
|
if (!isString(arguments[1]))
|
2012-09-23 05:28:59 +00:00
|
|
|
throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() + ". Must be String.",
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
void init(const ColumnsWithTypeAndName & arguments)
|
2012-09-23 05:28:59 +00:00
|
|
|
{
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
if (!col)
|
2020-10-18 19:00:13 +00:00
|
|
|
throw Exception("Illegal column " + arguments[0].column->getName()
|
2015-07-20 20:05:34 +00:00
|
|
|
+ " of first argument of function " + getName() + ". Must be constant string.",
|
2012-09-23 05:28:59 +00:00
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
String sep_str = col->getValue<String>();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
if (sep_str.size() != 1)
|
2018-11-22 21:19:58 +00:00
|
|
|
throw Exception("Illegal separator for function " + getName() + ". Must be exactly one byte.", ErrorCodes::BAD_ARGUMENTS);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
sep = sep_str[0];
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument, that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2013-08-07 11:25:02 +00:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
|
|
|
if (!pos)
|
|
|
|
return false;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
token_begin = pos;
|
2022-02-05 11:30:40 +00:00
|
|
|
pos = reinterpret_cast<Pos>(memchr(pos, sep, end - pos));
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
if (pos)
|
|
|
|
{
|
|
|
|
token_end = pos;
|
|
|
|
++pos;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
token_end = end;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
class SplitByStringImpl
|
|
|
|
{
|
|
|
|
private:
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
|
|
|
|
String sep;
|
|
|
|
|
|
|
|
public:
|
2014-11-12 17:23:26 +00:00
|
|
|
static constexpr auto name = "splitByString";
|
|
|
|
static String getName() { return name; }
|
2022-11-03 07:05:23 +00:00
|
|
|
static bool isVariadic() { return true; }
|
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
|
|
|
SplitByCharImpl::checkArguments(arguments);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
void init(const ColumnsWithTypeAndName & arguments)
|
2012-09-23 05:28:59 +00:00
|
|
|
{
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
if (!col)
|
2020-10-18 19:00:13 +00:00
|
|
|
throw Exception("Illegal column " + arguments[0].column->getName()
|
2015-07-20 20:05:34 +00:00
|
|
|
+ " of first argument of function " + getName() + ". Must be constant string.",
|
2012-09-23 05:28:59 +00:00
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
sep = col->getValue<String>();
|
2012-09-23 05:28:59 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2013-08-07 11:25:02 +00:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
2022-11-03 07:05:23 +00:00
|
|
|
return 2;
|
2022-10-25 08:38:36 +00:00
|
|
|
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2012-09-23 05:28:59 +00:00
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2012-09-23 05:28:59 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
2020-03-19 02:26:40 +00:00
|
|
|
if (sep.empty())
|
2012-09-23 05:28:59 +00:00
|
|
|
{
|
2020-03-20 05:37:25 +00:00
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
2020-03-19 02:26:40 +00:00
|
|
|
pos += 1;
|
2012-09-23 05:28:59 +00:00
|
|
|
token_end = pos;
|
|
|
|
}
|
|
|
|
else
|
2020-03-19 02:26:40 +00:00
|
|
|
{
|
2020-03-20 05:37:25 +00:00
|
|
|
if (!pos)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
|
|
|
|
2020-03-19 02:26:40 +00:00
|
|
|
pos = reinterpret_cast<Pos>(memmem(pos, end - pos, sep.data(), sep.size()));
|
|
|
|
|
|
|
|
if (pos)
|
|
|
|
{
|
|
|
|
token_end = pos;
|
|
|
|
pos += sep.size();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
token_end = end;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2021-05-13 02:37:09 +00:00
|
|
|
class SplitByRegexpImpl
|
|
|
|
{
|
|
|
|
private:
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle occurred as already as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
accross parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to peridic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
Regexps::RegexpPtr re;
|
2021-05-13 02:37:09 +00:00
|
|
|
OptimizedRegularExpression::MatchVec matches;
|
|
|
|
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2021-05-13 02:37:09 +00:00
|
|
|
public:
|
|
|
|
static constexpr auto name = "splitByRegexp";
|
|
|
|
static String getName() { return name; }
|
2022-01-30 01:10:27 +00:00
|
|
|
|
2022-10-25 08:38:36 +00:00
|
|
|
static bool isVariadic() { return true; }
|
|
|
|
static size_t getNumberOfArguments() { return 0; }
|
2021-05-13 02:37:09 +00:00
|
|
|
|
|
|
|
/// Check the type of function arguments.
|
|
|
|
static void checkArguments(const DataTypes & arguments)
|
|
|
|
{
|
|
|
|
SplitByStringImpl::checkArguments(arguments);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Initialize by the function arguments.
|
|
|
|
void init(const ColumnsWithTypeAndName & arguments)
|
|
|
|
{
|
|
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
|
|
|
|
|
|
|
if (!col)
|
|
|
|
throw Exception("Illegal column " + arguments[0].column->getName()
|
|
|
|
+ " of first argument of function " + getName() + ". Must be constant string.",
|
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
|
2021-05-13 03:15:38 +00:00
|
|
|
if (!col->getValue<String>().empty())
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
re = std::make_shared<Regexps::Regexp>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
2021-05-13 02:37:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the position of the argument that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
|
2021-05-13 02:37:09 +00:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
|
2021-05-13 02:37:09 +00:00
|
|
|
/// Called for each next string.
|
|
|
|
void set(Pos pos_, Pos end_)
|
|
|
|
{
|
|
|
|
pos = pos_;
|
|
|
|
end = end_;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the next token, if any, or return false.
|
|
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
|
|
{
|
2021-05-13 03:15:38 +00:00
|
|
|
if (!re)
|
2021-05-13 02:37:09 +00:00
|
|
|
{
|
2021-05-13 03:15:38 +00:00
|
|
|
if (pos == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
|
|
|
pos += 1;
|
|
|
|
token_end = pos;
|
2021-05-13 02:37:09 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2021-05-13 03:15:38 +00:00
|
|
|
if (!pos || pos > end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
token_begin = pos;
|
|
|
|
|
|
|
|
if (!re->match(pos, end - pos, matches) || !matches[0].length)
|
|
|
|
{
|
|
|
|
token_end = end;
|
|
|
|
pos = end + 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
token_end = pos + matches[0].offset;
|
|
|
|
pos = token_end + matches[0].length;
|
|
|
|
}
|
2021-05-13 02:37:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2013-08-05 08:40:56 +00:00
|
|
|
class ExtractAllImpl
|
2013-08-02 13:09:42 +00:00
|
|
|
{
|
|
|
|
private:
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle occurred as already as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
accross parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to peridic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
Regexps::RegexpPtr re;
|
2013-08-02 13:09:42 +00:00
|
|
|
OptimizedRegularExpression::MatchVec matches;
|
2013-08-02 16:31:23 +00:00
|
|
|
size_t capture;
|
2013-08-02 13:09:42 +00:00
|
|
|
|
|
|
|
Pos pos;
|
|
|
|
Pos end;
|
|
|
|
public:
|
2014-11-12 17:23:26 +00:00
|
|
|
static constexpr auto name = "extractAll";
|
|
|
|
static String getName() { return name; }
|
2022-01-30 01:10:27 +00:00
|
|
|
static bool isVariadic() { return false; }
|
2016-12-29 19:38:10 +00:00
|
|
|
static size_t getNumberOfArguments() { return 2; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Check the type of function arguments.
|
2018-11-24 01:48:06 +00:00
|
|
|
static void checkArguments(const DataTypes & arguments)
|
2013-08-02 13:09:42 +00:00
|
|
|
{
|
|
|
|
SplitByStringImpl::checkArguments(arguments);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Initialize by the function arguments.
|
2020-11-17 13:24:45 +00:00
|
|
|
void init(const ColumnsWithTypeAndName & arguments)
|
2013-08-02 13:09:42 +00:00
|
|
|
{
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2013-08-02 13:09:42 +00:00
|
|
|
if (!col)
|
2020-10-18 19:00:13 +00:00
|
|
|
throw Exception("Illegal column " + arguments[1].column->getName()
|
2015-07-20 20:05:34 +00:00
|
|
|
+ " of first argument of function " + getName() + ". Must be constant string.",
|
2013-08-02 13:09:42 +00:00
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
re = std::make_shared<Regexps::Regexp>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
2013-08-02 16:31:23 +00:00
|
|
|
capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2013-08-05 08:40:56 +00:00
|
|
|
matches.resize(capture + 1);
|
2013-08-02 13:09:42 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Returns the position of the argument that is the column of strings
|
2022-03-12 18:05:50 +00:00
|
|
|
static size_t getStringsArgumentPosition()
{
    /// The column of strings to tokenize is the argument at index 0.
    constexpr size_t strings_argument_index = 0;
    return strings_argument_index;
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// Returns the position of the possible max_substrings argument. std::nullopt means max_substrings argument is disabled in current function.
|
|
|
|
static std::optional<size_t> getMaxSubstringsArgumentPosition()
{
    /// This generator has no max_substrings argument, so no position is reported.
    return {};
}
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Called for each next string.
|
2013-08-02 13:09:42 +00:00
|
|
|
void set(Pos pos_, Pos end_)
{
    /// Store the boundaries of the string that subsequent get() calls will scan.
    end = end_;
    pos = pos_;
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Get the next token, if any, or return false.
|
2013-08-02 13:09:42 +00:00
|
|
|
bool get(Pos & token_begin, Pos & token_end)
{
    /// Nothing to scan: no string was set, or the cursor has moved past the end.
    const bool exhausted = !pos || end < pos;
    if (exhausted)
        return false;

    /// Stop when the pattern does not match the remainder, or when the whole match is empty.
    const bool found = re->match(pos, end - pos, matches);
    if (!found || matches[0].length == 0)
        return false;

    const auto & whole_match = matches[0];
    const auto & wanted = matches[capture];

    if (wanted.offset != std::string::npos)
    {
        token_begin = pos + wanted.offset;
        token_end = token_begin + wanted.length;
    }
    else
    {
        /// The selected subpattern has no position in this match: emit an empty token at the cursor.
        token_begin = pos;
        token_end = pos;
    }

    /// Continue the next search right after the whole match.
    pos += whole_match.offset + whole_match.length;

    return true;
}
|
|
|
|
};
|
2012-09-23 05:28:59 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// A function that takes a string, and returns an array of substrings created by some generator.
|
2012-09-23 05:28:59 +00:00
|
|
|
template <typename Generator>
|
|
|
|
class FunctionTokens : public IFunction
|
|
|
|
{
|
|
|
|
public:
|
2014-11-12 17:23:26 +00:00
|
|
|
static constexpr auto name = Generator::name;
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionTokens>(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-11 23:36:45 +00:00
|
|
|
String getName() const override
|
2012-09-23 05:28:59 +00:00
|
|
|
{
|
2014-11-12 17:23:26 +00:00
|
|
|
return name;
|
2012-09-23 05:28:59 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2021-04-29 14:48:26 +00:00
|
|
|
|
2022-01-30 01:10:27 +00:00
|
|
|
bool isVariadic() const override { return Generator::isVariadic(); }
|
|
|
|
|
2016-12-29 19:38:10 +00:00
|
|
|
size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-07-06 09:47:55 +00:00
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
2012-09-23 05:28:59 +00:00
|
|
|
{
|
2013-03-26 19:30:23 +00:00
|
|
|
Generator::checkArguments(arguments);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
const auto max_substrings_pos = Generator::getMaxSubstringsArgumentPosition();
|
2022-11-03 08:12:19 +00:00
|
|
|
if (max_substrings_pos && *max_substrings_pos < arguments.size() && !isNativeInteger(arguments[*max_substrings_pos]))
|
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
|
|
"{}-th argument for function '{}' must be integer, got '{}' instead",
|
|
|
|
*max_substrings_pos + 1,
|
|
|
|
getName(),
|
|
|
|
arguments[*max_substrings_pos]->getName());
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2016-05-28 07:48:40 +00:00
|
|
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
2012-09-17 04:34:19 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
|
2012-09-17 04:34:19 +00:00
|
|
|
{
|
2013-03-26 19:30:23 +00:00
|
|
|
Generator generator;
|
2020-10-18 19:00:13 +00:00
|
|
|
generator.init(arguments);
|
|
|
|
const auto & array_argument = arguments[generator.getStringsArgumentPosition()];
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-10-25 08:38:36 +00:00
|
|
|
/// Whether we need to limit max tokens returned by Generator::get
|
2022-11-03 07:05:23 +00:00
|
|
|
/// If max_substrings is std::nullopt, no limit is applied.
|
|
|
|
auto max_substrings = getMaxSubstrings(arguments);
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnString * col_str = checkAndGetColumn<ColumnString>(array_argument.column.get());
|
2017-07-21 06:35:58 +00:00
|
|
|
const ColumnConst * col_const_str =
|
2020-10-18 19:00:13 +00:00
|
|
|
checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-14 01:43:19 +00:00
|
|
|
auto col_res = ColumnArray::create(ColumnString::create());
|
2014-06-26 00:58:14 +00:00
|
|
|
ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnArray::Offsets & res_offsets = col_res->getOffsets();
|
2018-11-25 00:08:50 +00:00
|
|
|
ColumnString::Chars & res_strings_chars = res_strings.getChars();
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
if (col_str)
|
|
|
|
{
|
2018-11-25 00:08:50 +00:00
|
|
|
const ColumnString::Chars & src_chars = col_str->getChars();
|
2017-12-15 21:32:25 +00:00
|
|
|
const ColumnString::Offsets & src_offsets = col_str->getOffsets();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
res_offsets.reserve(src_offsets.size());
|
2017-05-27 15:45:25 +00:00
|
|
|
res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random.
|
2012-09-17 04:34:19 +00:00
|
|
|
res_strings_chars.reserve(src_chars.size());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2014-04-08 07:31:51 +00:00
|
|
|
Pos token_begin = nullptr;
|
|
|
|
Pos token_end = nullptr;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
size_t size = src_offsets.size();
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnString::Offset current_src_offset = 0;
|
|
|
|
ColumnArray::Offset current_dst_offset = 0;
|
|
|
|
ColumnString::Offset current_dst_strings_offset = 0;
|
2012-09-17 04:34:19 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
|
|
|
|
current_src_offset = src_offsets[i];
|
2012-09-23 05:28:59 +00:00
|
|
|
Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
generator.set(pos, end);
|
2012-09-17 04:34:19 +00:00
|
|
|
size_t j = 0;
|
2022-11-03 07:05:23 +00:00
|
|
|
while (generator.get(token_begin, token_end) && !(max_substrings && j >= *max_substrings))
|
2012-09-17 04:34:19 +00:00
|
|
|
{
|
|
|
|
size_t token_size = token_end - token_begin;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
res_strings_chars.resize(res_strings_chars.size() + token_size + 1);
|
2016-04-15 00:33:21 +00:00
|
|
|
memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size);
|
2013-10-31 19:24:22 +00:00
|
|
|
res_strings_chars[current_dst_strings_offset + token_size] = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
current_dst_strings_offset += token_size + 1;
|
|
|
|
res_strings_offsets.push_back(current_dst_strings_offset);
|
|
|
|
++j;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
current_dst_offset += j;
|
|
|
|
res_offsets.push_back(current_dst_offset);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-10-18 19:00:13 +00:00
|
|
|
return col_res;
|
2012-09-17 04:34:19 +00:00
|
|
|
}
|
|
|
|
else if (col_const_str)
|
|
|
|
{
|
2017-07-21 06:35:58 +00:00
|
|
|
String src = col_const_str->getValue<String>();
|
2012-09-17 04:34:19 +00:00
|
|
|
Array dst;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2012-09-23 05:28:59 +00:00
|
|
|
generator.set(src.data(), src.data() + src.size());
|
2014-04-08 07:31:51 +00:00
|
|
|
Pos token_begin = nullptr;
|
|
|
|
Pos token_end = nullptr;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
while (generator.get(token_begin, token_end) && !(max_substrings && dst.size() >= *max_substrings))
|
2012-09-17 04:34:19 +00:00
|
|
|
dst.push_back(String(token_begin, token_end - token_begin));
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-10-18 19:00:13 +00:00
|
|
|
return result_type->createColumnConst(col_const_str->size(), dst);
|
2012-09-17 04:34:19 +00:00
|
|
|
}
|
|
|
|
else
|
2020-10-18 19:00:13 +00:00
|
|
|
throw Exception("Illegal columns " + array_argument.column->getName()
|
|
|
|
+ ", " + array_argument.column->getName()
|
2012-09-17 04:34:19 +00:00
|
|
|
+ " of arguments of function " + getName(),
|
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
}
|
2022-10-25 08:38:36 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
template <typename DataType>
|
2022-11-03 07:05:23 +00:00
|
|
|
std::optional<Int64> getMaxSubstringsImpl(const ColumnWithTypeAndName & argument) const
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
|
|
|
const auto * col = checkAndGetColumnConst<ColumnVector<DataType>>(argument.column.get());
|
|
|
|
if (!col)
|
|
|
|
return {};
|
|
|
|
|
|
|
|
auto value = col->template getValue<DataType>();
|
|
|
|
return static_cast<Int64>(value);
|
|
|
|
}
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
std::optional<size_t> getMaxSubstrings(const ColumnsWithTypeAndName & arguments) const
|
2022-10-25 08:38:36 +00:00
|
|
|
{
|
2022-11-03 07:05:23 +00:00
|
|
|
const auto pos = Generator::getMaxSubstringsArgumentPosition();
|
2022-10-25 08:38:36 +00:00
|
|
|
if (!pos)
|
2022-11-03 08:12:19 +00:00
|
|
|
return std::nullopt;
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2022-11-03 08:12:19 +00:00
|
|
|
if (*pos >= arguments.size())
|
|
|
|
return std::nullopt;
|
2022-10-25 08:38:36 +00:00
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
std::optional<Int64> max_substrings;
|
2022-11-03 08:12:19 +00:00
|
|
|
if (!((max_substrings = getMaxSubstringsImpl<UInt8>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int8>(arguments[*pos]))
|
|
|
|
|| (max_substrings = getMaxSubstringsImpl<UInt16>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int16>(arguments[*pos]))
|
|
|
|
|| (max_substrings = getMaxSubstringsImpl<UInt32>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int32>(arguments[*pos]))
|
|
|
|
|| (max_substrings = getMaxSubstringsImpl<UInt64>(arguments[*pos])) || (max_substrings = getMaxSubstringsImpl<Int64>(arguments[*pos]))))
|
2022-10-25 08:38:36 +00:00
|
|
|
throw Exception(
|
|
|
|
ErrorCodes::ILLEGAL_COLUMN,
|
|
|
|
"Illegal column {}, which is {}-th argument of function {}",
|
|
|
|
arguments[*pos].column->getName(),
|
|
|
|
*pos + 1,
|
|
|
|
getName());
|
|
|
|
|
2022-11-03 07:05:23 +00:00
|
|
|
/// If max_substrings is negative or zero, tokenize will be applied as many times as possible, which is equivalent to
|
|
|
|
/// no max_substrings argument in function
|
|
|
|
if (max_substrings && *max_substrings <= 0)
|
2022-11-03 08:12:19 +00:00
|
|
|
return std::nullopt;
|
2022-11-03 07:05:23 +00:00
|
|
|
|
|
|
|
return *max_substrings;
|
2022-10-25 08:38:36 +00:00
|
|
|
}
|
2012-09-17 04:34:19 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2021-10-22 20:19:32 +00:00
|
|
|
/// Joins an array of type serializable to string into one string via a separator.
|
2015-10-19 18:49:52 +00:00
|
|
|
class FunctionArrayStringConcat : public IFunction
|
|
|
|
{
|
|
|
|
private:
|
2021-10-19 17:54:27 +00:00
|
|
|
static void executeInternal(
|
2018-11-25 00:08:50 +00:00
|
|
|
const ColumnString::Chars & src_chars,
|
2017-12-15 21:32:25 +00:00
|
|
|
const ColumnString::Offsets & src_string_offsets,
|
|
|
|
const ColumnArray::Offsets & src_array_offsets,
|
2021-10-19 17:54:27 +00:00
|
|
|
const char * delimiter,
|
|
|
|
const size_t delimiter_size,
|
2018-11-25 00:08:50 +00:00
|
|
|
ColumnString::Chars & dst_chars,
|
2021-10-19 17:54:27 +00:00
|
|
|
ColumnString::Offsets & dst_string_offsets,
|
|
|
|
const char8_t * null_map)
|
2015-10-19 18:49:52 +00:00
|
|
|
{
|
|
|
|
size_t size = src_array_offsets.size();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
if (!size)
|
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// With a small margin - as if the separator goes after the last string of the array.
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_chars.resize(
|
|
|
|
src_chars.size()
|
2017-05-27 15:45:25 +00:00
|
|
|
+ delimiter_size * src_string_offsets.size() /// Separators after each string...
|
|
|
|
+ src_array_offsets.size() /// Zero byte after each joined string
|
|
|
|
- src_string_offsets.size()); /// The former zero byte after each string of the array
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// There will be as many strings as there were arrays.
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_string_offsets.resize(src_array_offsets.size());
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnArray::Offset current_src_array_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-15 21:32:25 +00:00
|
|
|
ColumnString::Offset current_dst_string_offset = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Loop through the array of strings.
|
2015-10-19 18:49:52 +00:00
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
2021-10-19 17:54:27 +00:00
|
|
|
bool first_non_null = true;
|
2017-05-27 15:45:25 +00:00
|
|
|
/// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1.
|
2015-10-19 18:49:52 +00:00
|
|
|
for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset)
|
|
|
|
{
|
2021-10-19 17:54:27 +00:00
|
|
|
if (unlikely(null_map && null_map[current_src_array_offset]))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!first_non_null)
|
|
|
|
{
|
|
|
|
memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size);
|
|
|
|
current_dst_string_offset += delimiter_size;
|
|
|
|
}
|
|
|
|
first_non_null = false;
|
|
|
|
|
|
|
|
const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0;
|
2015-10-19 18:49:52 +00:00
|
|
|
size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-04-15 00:33:21 +00:00
|
|
|
memcpySmallAllowReadWriteOverflow15(
|
|
|
|
&dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
current_dst_string_offset += bytes_to_copy;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_chars[current_dst_string_offset] = 0;
|
|
|
|
++current_dst_string_offset;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_string_offsets[i] = current_dst_string_offset;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_chars.resize(dst_string_offsets.back());
|
|
|
|
}
|
|
|
|
|
2021-10-19 17:54:27 +00:00
|
|
|
static void executeInternal(
|
|
|
|
const ColumnString & col_string,
|
|
|
|
const ColumnArray & col_arr,
|
|
|
|
const String & delimiter,
|
|
|
|
ColumnString & col_res,
|
|
|
|
const char8_t * null_map = nullptr)
|
|
|
|
{
|
|
|
|
executeInternal(
|
|
|
|
col_string.getChars(),
|
|
|
|
col_string.getOffsets(),
|
|
|
|
col_arr.getOffsets(),
|
|
|
|
delimiter.data(),
|
|
|
|
delimiter.size(),
|
|
|
|
col_res.getChars(),
|
|
|
|
col_res.getOffsets(),
|
|
|
|
null_map);
|
|
|
|
}
|
|
|
|
|
2021-10-22 20:19:32 +00:00
|
|
|
static ColumnPtr serializeNestedColumn(const ColumnArray & col_arr, const DataTypePtr & nested_type)
|
|
|
|
{
|
|
|
|
if (isString(nested_type))
|
|
|
|
{
|
|
|
|
return col_arr.getDataPtr();
|
|
|
|
}
|
|
|
|
else if (const ColumnNullable * col_nullable = checkAndGetColumn<ColumnNullable>(col_arr.getData());
|
|
|
|
col_nullable && isString(col_nullable->getNestedColumn().getDataType()))
|
|
|
|
{
|
|
|
|
return col_nullable->getNestedColumnPtr();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ColumnsWithTypeAndName cols;
|
|
|
|
cols.emplace_back(col_arr.getDataPtr(), nested_type, "tmp");
|
2021-12-15 17:40:36 +00:00
|
|
|
return ConvertImplGenericToString<ColumnString>::execute(cols, std::make_shared<DataTypeString>(), col_arr.size());
|
2021-10-22 20:19:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
public:
|
|
|
|
static constexpr auto name = "arrayStringConcat";
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayStringConcat>(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2015-10-19 18:49:52 +00:00
|
|
|
String getName() const override
|
|
|
|
{
|
|
|
|
return name;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2016-12-29 19:38:10 +00:00
|
|
|
bool isVariadic() const override { return true; }
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2016-12-29 19:38:10 +00:00
|
|
|
size_t getNumberOfArguments() const override { return 0; }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-10-12 21:23:40 +00:00
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
|
2015-10-19 18:49:52 +00:00
|
|
|
{
|
|
|
|
String delimiter;
|
|
|
|
if (arguments.size() == 2)
|
|
|
|
{
|
2020-10-18 19:00:13 +00:00
|
|
|
const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
2015-10-19 18:49:52 +00:00
|
|
|
if (!col_delim)
|
|
|
|
throw Exception("Second argument for function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_COLUMN);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
delimiter = col_delim->getValue<String>();
|
2015-10-19 18:49:52 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-10-22 20:19:32 +00:00
|
|
|
const auto & nested_type = assert_cast<const DataTypeArray &>(*arguments[0].type).getNestedType();
|
|
|
|
if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arguments[0].column.get());
|
|
|
|
col_const_arr && isString(nested_type))
|
2015-10-19 18:49:52 +00:00
|
|
|
{
|
2017-07-21 06:35:58 +00:00
|
|
|
Array src_arr = col_const_arr->getValue<Array>();
|
|
|
|
String dst_str;
|
2021-10-12 21:23:40 +00:00
|
|
|
bool first_non_null = true;
|
2015-10-19 18:49:52 +00:00
|
|
|
for (size_t i = 0, size = src_arr.size(); i < size; ++i)
|
|
|
|
{
|
2021-10-12 21:23:40 +00:00
|
|
|
if (src_arr[i].isNull())
|
|
|
|
continue;
|
|
|
|
if (!first_non_null)
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_str += delimiter;
|
2021-10-12 21:23:40 +00:00
|
|
|
first_non_null = false;
|
2015-10-19 18:49:52 +00:00
|
|
|
dst_str += src_arr[i].get<const String &>();
|
|
|
|
}
|
2017-07-21 06:35:58 +00:00
|
|
|
|
2020-10-18 19:00:13 +00:00
|
|
|
return result_type->createColumnConst(col_const_arr->size(), dst_str);
|
2015-10-19 18:49:52 +00:00
|
|
|
}
|
2021-10-22 20:19:32 +00:00
|
|
|
|
|
|
|
ColumnPtr src_column = arguments[0].column->convertToFullColumnIfConst();
|
|
|
|
const ColumnArray & col_arr = assert_cast<const ColumnArray &>(*src_column.get());
|
|
|
|
|
|
|
|
ColumnPtr str_subcolumn = serializeNestedColumn(col_arr, nested_type);
|
|
|
|
const ColumnString & col_string = assert_cast<const ColumnString &>(*str_subcolumn.get());
|
|
|
|
|
|
|
|
auto col_res = ColumnString::create();
|
|
|
|
if (const ColumnNullable * col_nullable = checkAndGetColumn<ColumnNullable>(col_arr.getData()))
|
|
|
|
executeInternal(col_string, col_arr, delimiter, *col_res, col_nullable->getNullMapData().data());
|
2015-10-19 18:49:52 +00:00
|
|
|
else
|
2021-10-22 20:19:32 +00:00
|
|
|
executeInternal(col_string, col_arr, delimiter, *col_res);
|
|
|
|
return col_res;
|
2015-10-19 18:49:52 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2022-11-03 07:06:58 +00:00
|
|
|
/// Concrete functions: FunctionTokens instantiated with each generator.
using FunctionSplitByAlpha = FunctionTokens<SplitByAlphaImpl>;
using FunctionSplitByNonAlpha = FunctionTokens<SplitByNonAlphaImpl>;
using FunctionSplitByWhitespace = FunctionTokens<SplitByWhitespaceImpl>;
using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
using FunctionExtractAll = FunctionTokens<ExtractAllImpl>;
|
2012-09-23 05:28:59 +00:00
|
|
|
|
2012-09-17 04:34:19 +00:00
|
|
|
}
|