mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-13 01:41:59 +00:00
159 lines
4.0 KiB
C++
159 lines
4.0 KiB
C++
#include <Columns/ColumnConst.h>
|
|
#include <Functions/FunctionHelpers.h>
|
|
#include <Functions/FunctionTokens.h>
|
|
#include <Functions/FunctionFactory.h>
|
|
#include <Functions/Regexps.h>
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
#include <Common/assert_cast.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
{
|
|
extern const int ILLEGAL_COLUMN;
|
|
}
|
|
|
|
|
|
/** Functions that split strings into an array of strings or vice versa.
|
|
*
|
|
* splitByRegexp(regexp, s[, max_substrings])
|
|
*/
|
|
namespace
|
|
{
|
|
|
|
using Pos = const char *;
|
|
|
|
class SplitByRegexpImpl
|
|
{
|
|
private:
|
|
Regexps::RegexpPtr re;
|
|
OptimizedRegularExpression::MatchVec matches;
|
|
|
|
Pos pos;
|
|
Pos end;
|
|
|
|
std::optional<size_t> max_splits;
|
|
size_t splits;
|
|
bool max_substrings_includes_remaining_string;
|
|
|
|
public:
|
|
static constexpr auto name = "splitByRegexp";
|
|
|
|
static bool isVariadic() { return true; }
|
|
static size_t getNumberOfArguments() { return 0; }
|
|
|
|
static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {0, 2}; }
|
|
|
|
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
|
|
{
|
|
checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments);
|
|
}
|
|
|
|
static constexpr auto strings_argument_position = 1uz;
|
|
|
|
void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
|
|
{
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
|
|
|
if (!col)
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
|
|
"Must be constant string.", arguments[0].column->getName(), name);
|
|
|
|
if (!col->getValue<String>().empty())
|
|
re = std::make_shared<OptimizedRegularExpression>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
|
|
|
max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
|
|
max_splits = extractMaxSplits(arguments, 2);
|
|
}
|
|
|
|
/// Called for each next string.
|
|
void set(Pos pos_, Pos end_)
|
|
{
|
|
pos = pos_;
|
|
end = end_;
|
|
splits = 0;
|
|
}
|
|
|
|
/// Get the next token, if any, or return false.
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
{
|
|
if (!re)
|
|
{
|
|
if (pos == end)
|
|
return false;
|
|
|
|
token_begin = pos;
|
|
|
|
if (max_splits)
|
|
{
|
|
if (max_substrings_includes_remaining_string)
|
|
{
|
|
if (splits == *max_splits - 1)
|
|
{
|
|
token_end = end;
|
|
pos = end;
|
|
return true;
|
|
}
|
|
}
|
|
else
|
|
if (splits == *max_splits)
|
|
return false;
|
|
}
|
|
|
|
pos += 1;
|
|
token_end = pos;
|
|
++splits;
|
|
}
|
|
else
|
|
{
|
|
if (!pos || pos > end)
|
|
return false;
|
|
|
|
token_begin = pos;
|
|
|
|
if (max_splits)
|
|
{
|
|
if (max_substrings_includes_remaining_string)
|
|
{
|
|
if (splits == *max_splits - 1)
|
|
{
|
|
token_end = end;
|
|
pos = nullptr;
|
|
return true;
|
|
}
|
|
}
|
|
else
|
|
if (splits == *max_splits)
|
|
return false;
|
|
}
|
|
|
|
if (!re->match(pos, end - pos, matches) || !matches[0].length)
|
|
{
|
|
token_end = end;
|
|
pos = end + 1;
|
|
}
|
|
else
|
|
{
|
|
token_end = pos + matches[0].offset;
|
|
pos = token_end + matches[0].length;
|
|
++splits;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
|
|
|
|
}
|
|
|
|
REGISTER_FUNCTION(SplitByRegexp)
|
|
{
|
|
factory.registerFunction<FunctionSplitByRegexp>();
|
|
}
|
|
|
|
}
|