From 4fb23c25fb2ed632f16904684d9f21f6dcf7f3a3 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Sat, 19 Jun 2021 12:33:36 +0000 Subject: [PATCH] Added SplitByWhitespace & SplitByNonAlpha functions (new tokenize functions) --- src/Functions/FunctionsStringArray.cpp | 2 + src/Functions/FunctionsStringArray.h | 120 +++++++++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp index 14092d7dd3d..765317093c1 100644 --- a/src/Functions/FunctionsStringArray.cpp +++ b/src/Functions/FunctionsStringArray.cpp @@ -9,6 +9,8 @@ void registerFunctionsStringArray(FunctionFactory & factory) { factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 27f10797651..94ca6751fd9 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -32,6 +32,9 @@ namespace ErrorCodes * splitByChar(sep, s) * splitByString(sep, s) * splitByRegexp(regexp, s) + * + * splitByWhitespace(s) - split the string by whitespace characters + * splitByNonAlpha(s) - split the string by whitespace and punctuation characters * * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp. * - first subpattern, if regexp has subpattern; @@ -111,6 +114,121 @@ public: } }; +class SplitByNonAlphaImpl +{ +private: + Pos pos; + Pos end; + +public: + /// Function name, as returned by getName(). + static constexpr auto name = "splitByNonAlpha"; + static String getName() { return name; } + + static size_t getNumberOfArguments() { return 1; } + + /// Check the argument types: throws ILLEGAL_TYPE_OF_ARGUMENT unless isString() accepts the first argument.
+ static void checkArguments(const DataTypes & arguments) + { + if (!isString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + /// Initialize by the function arguments; a no-op here, the arguments are not used. + void init(const ColumnsWithTypeAndName & /*arguments*/) {} + + /// Called for each next string: stores the [pos_, end_) range that get() scans. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + } + + /// Returns the position of the argument that is the column of strings; always 0. + size_t getStringsArgumentPosition() + { + return 0; + } + + /// Get the next token as the range [token_begin, token_end), or return false if none is left. + bool get(Pos & token_begin, Pos & token_end) + { + /// Skip leading separators (isWhitespaceASCII or isPunctuationASCII characters) + while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos))) + ++pos; + + token_end = pos; + + return true; + } +}; + +class SplitByWhitespaceImpl +{ +private: + Pos pos; + Pos end; + +public: + /// Function name, as returned by getName(). + static constexpr auto name = "splitByWhitespace"; + static String getName() { return name; } + + static size_t getNumberOfArguments() { return 1; } + + /// Check the argument types: throws ILLEGAL_TYPE_OF_ARGUMENT unless isString() accepts the first argument. + static void checkArguments(const DataTypes & arguments) + { + if (!isString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + /// Initialize by the function arguments; a no-op here, the arguments are not used. + void init(const ColumnsWithTypeAndName & /*arguments*/) {} + + /// Called for each next string: stores the [pos_, end_) range that get() scans. + void set(Pos pos_, Pos end_) + { + pos = pos_; + end = end_; + } + + /// Returns the position of the argument that is the column of strings; always 0. + size_t getStringsArgumentPosition() + { + return 0; + } + + /// Get the next token as the range [token_begin, token_end), or return false if none is left.
+ bool get(Pos & token_begin, Pos & token_end) + { + /// Skip leading separators (isWhitespaceASCII characters) + while (pos < end && isWhitespaceASCII(*pos)) + ++pos; + + if (pos == end) + return false; + + token_begin = pos; + + while (pos < end && !isWhitespaceASCII(*pos)) + ++pos; + + token_end = pos; + + return true; + } +}; class SplitByCharImpl { @@ -662,6 +780,8 @@ public: using FunctionAlphaTokens = FunctionTokens; +using FunctionSplitByNonAlpha = FunctionTokens; +using FunctionSplitByWhitespace = FunctionTokens; using FunctionSplitByChar = FunctionTokens; using FunctionSplitByString = FunctionTokens; using FunctionSplitByRegexp = FunctionTokens;