Added SplitByWhitespace & SplitByNonAlpha functions (new tokenize functions)

This commit is contained in:
Nikolay Degterinsky 2021-06-19 12:33:36 +00:00
parent 9e77ca7182
commit 4fb23c25fb
2 changed files with 122 additions and 0 deletions

View File

@ -9,6 +9,8 @@ void registerFunctionsStringArray(FunctionFactory & factory)
{
factory.registerFunction<FunctionExtractAll>();
factory.registerFunction<FunctionAlphaTokens>();
factory.registerFunction<FunctionSplitByNonAlpha>();
factory.registerFunction<FunctionSplitByWhitespace>();
factory.registerFunction<FunctionSplitByChar>();
factory.registerFunction<FunctionSplitByString>();
factory.registerFunction<FunctionSplitByRegexp>();

View File

@ -32,6 +32,9 @@ namespace ErrorCodes
* splitByChar(sep, s)
* splitByString(sep, s)
* splitByRegexp(regexp, s)
*
* splitByWhitespace(s) - split the string by ASCII whitespace characters; empty tokens are skipped
* splitByNonAlpha(s) - split the string by ASCII whitespace and punctuation characters; empty tokens are skipped
*
* extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
* - first subpattern, if regexp has subpattern;
@ -111,6 +114,121 @@ public:
}
};
/** Token generator for the splitByNonAlpha(s) function.
  * A separator is any character for which isWhitespaceASCII or isPunctuationASCII is true.
  * Each token is a maximal run of non-separator characters; empty tokens are never produced.
  */
class SplitByNonAlphaImpl
{
private:
    /// Current scan position inside the string being split.
    Pos pos;
    /// Scan limit: characters at or past this position are never read.
    Pos end;

    /// True if the character separates tokens.
    static bool isSeparator(char c)
    {
        return isWhitespaceASCII(c) || isPunctuationASCII(c);
    }

public:
    /// Name under which the function is exposed.
    static constexpr auto name = "splitByNonAlpha";

    static String getName()
    {
        return name;
    }

    /// The function takes exactly one argument: the string to split.
    static size_t getNumberOfArguments()
    {
        return 1;
    }

    /// Validate argument types; throws ILLEGAL_TYPE_OF_ARGUMENT unless the first argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (isString(arguments[0]))
            return;

        throw Exception(
            "Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Nothing to prepare: the generator takes no parameters from the arguments.
    void init(const ColumnsWithTypeAndName & /*arguments*/)
    {
    }

    /// Start scanning a new string occupying the range [pos_, end_).
    void set(Pos pos_, Pos end_)
    {
        end = end_;
        pos = pos_;
    }

    /// Index of the argument that holds the column of strings to split.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Fetch the next token into [token_begin, token_end).
    /// Returns false, leaving both outputs untouched, when no token remains.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Step over the separators preceding the next token.
        for (; pos < end; ++pos)
            if (!isSeparator(*pos))
                break;

        if (pos == end)
            return false;

        token_begin = pos;

        /// Consume the token up to the next separator or the end of the string.
        for (; pos < end; ++pos)
            if (isSeparator(*pos))
                break;

        token_end = pos;
        return true;
    }
};
/** Token generator for the splitByWhitespace(s) function.
  * A separator is any character for which isWhitespaceASCII is true.
  * Each token is a maximal run of non-separator characters; empty tokens are never produced.
  */
class SplitByWhitespaceImpl
{
private:
    /// Current scan position inside the string being split.
    Pos pos;
    /// Scan limit: characters at or past this position are never read.
    Pos end;

public:
    /// Name under which the function is exposed.
    static constexpr auto name = "splitByWhitespace";

    static String getName()
    {
        return name;
    }

    /// The function takes exactly one argument: the string to split.
    static size_t getNumberOfArguments()
    {
        return 1;
    }

    /// Validate argument types; throws ILLEGAL_TYPE_OF_ARGUMENT unless the first argument is a String.
    static void checkArguments(const DataTypes & arguments)
    {
        if (isString(arguments[0]))
            return;

        throw Exception(
            "Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    /// Nothing to prepare: the generator takes no parameters from the arguments.
    void init(const ColumnsWithTypeAndName & /*arguments*/)
    {
    }

    /// Start scanning a new string occupying the range [pos_, end_).
    void set(Pos pos_, Pos end_)
    {
        end = end_;
        pos = pos_;
    }

    /// Index of the argument that holds the column of strings to split.
    size_t getStringsArgumentPosition()
    {
        return 0;
    }

    /// Fetch the next token into [token_begin, token_end).
    /// Returns false, leaving both outputs untouched, when no token remains.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Step over the whitespace preceding the next token.
        for (; pos < end; ++pos)
            if (!isWhitespaceASCII(*pos))
                break;

        if (pos == end)
            return false;

        token_begin = pos;

        /// Consume the token up to the next whitespace character or the end of the string.
        for (; pos < end; ++pos)
            if (isWhitespaceASCII(*pos))
                break;

        token_end = pos;
        return true;
    }
};
class SplitByCharImpl
{
@ -662,6 +780,8 @@ public:
/// Concrete function types: each alias instantiates the FunctionTokens template
/// with one of the token generator implementations.
using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>;
using FunctionSplitByNonAlpha = FunctionTokens<SplitByNonAlphaImpl>;
using FunctionSplitByWhitespace = FunctionTokens<SplitByWhitespaceImpl>;
using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;