diff --git a/src/Functions/normalizeQuery.cpp b/src/Functions/normalizeQuery.cpp
new file mode 100644
index 00000000000..91c5b9d1680
--- /dev/null
+++ b/src/Functions/normalizeQuery.cpp
@@ -0,0 +1,171 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+}
+
+namespace
+{
+/// Implementation of normalizeQuery over a string column: every row is lexed and re-emitted with literals replaced by '?', whitespace and comments collapsed to a single space, and complex identifiers replaced by `?` in backticks.
+struct Impl
+{
+    static void vector(const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        size_t size = offsets.size();
+        res_offsets.resize(size);
+        res_data.reserve(data.size());
+        /// Row i starts at the previous offset and ends at offsets[i]; the lexer is given the row without its last byte (presumably the terminating zero byte - output rows get one via push_back(0) below).
+        ColumnString::Offset prev_src_offset = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            ColumnString::Offset curr_src_offset = offsets[i];
+            Lexer lexer(reinterpret_cast(&data[prev_src_offset]), reinterpret_cast(&data[curr_src_offset - 1]));
+            prev_src_offset = offsets[i];
+
+            /// Coalesce whitespace characters and comments to a single whitespace: the flag is true while the previous token was insignificant.
+            bool prev_insignificant = false;
+
+            /// Coalesce a list of comma separated literals to a single '?..' sequence. A comma or whitespace seen after the latest literal is remembered in the flags below and written out only if a non-literal token follows.
+            size_t num_literals_in_sequence = 0;
+            bool prev_comma = false;
+            bool prev_whitespace = false;
+
+            while (true)
+            {
+                Token token = lexer.nextToken();
+
+                if (!token.isSignificant())
+                {
+                    /// Replace a sequence of insignificant tokens with a single whitespace. Inside a sequence of literals the whitespace is not written immediately but deferred via prev_whitespace.
+                    if (!prev_insignificant)
+                    {
+                        if (0 == num_literals_in_sequence)
+                            res_data.push_back(' ');
+                        else
+                            prev_whitespace = true;
+                    }
+                    prev_insignificant = true;
+                    continue;
+                }
+
+                prev_insignificant = false;
+
+                /// Literals: only the first number or string literal of a sequence emits '?'; the following ones are just counted.
+                if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
+                {
+                    if (0 == num_literals_in_sequence)
+                        res_data.push_back('?');
+                    ++num_literals_in_sequence;
+                    prev_whitespace = false;
+                    prev_comma = false;
+                    continue;
+                }
+                else if (token.type == TokenType::Comma) /// A comma after a literal is held back; any other comma falls through and is copied as is.
+                {
+                    if (num_literals_in_sequence)
+                    {
+                        prev_comma = true;
+                        continue;
+                    }
+                }
+                else /// Any other token ends the sequence of literals: flush the pending "..", ',' and ' ', then reset the state.
+                {
+                    if (num_literals_in_sequence > 1)
+                    {
+                        res_data.push_back('.');
+                        res_data.push_back('.');
+                    }
+
+                    if (prev_comma)
+                        res_data.push_back(',');
+
+                    if (prev_whitespace)
+                        res_data.push_back(' ');
+
+                    num_literals_in_sequence = 0;
+                    prev_comma = false;
+                    prev_whitespace = false;
+                }
+
+                /// Slightly normalize things that look like aliases: if they are complex, replace them with a `?` placeholder in backticks.
+                if (token.type == TokenType::QuotedIdentifier
+                    /// Differentiate identifier from function (example: SHA224(x)).
+                    /// By the way, there is padding in columns, so dereferencing token.end is Ok.
+                    || (token.type == TokenType::BareWord && *token.end != '('))
+                {
+                    /// Identifier is complex if it contains whitespace or more than two digits.
+                    size_t num_digits = 0;
+
+                    const char * pos = token.begin;
+                    for (; pos != token.end; ++pos)
+                    {
+                        if (isWhitespaceASCII(*pos))
+                            break;
+
+                        if (isNumericASCII(*pos))
+                        {
+                            ++num_digits;
+                            if (num_digits > 2)
+                                break;
+                        }
+                    }
+
+                    if (pos == token.end) /// The scan was not interrupted, so the identifier is simple: copy it as is.
+                    {
+                        res_data.insert(token.begin, token.end);
+                    }
+                    else
+                    {
+                        res_data.push_back('`');
+                        res_data.push_back('?');
+                        res_data.push_back('`');
+                    }
+
+                    continue;
+                }
+
+                if (token.isEnd() || token.isError()) /// Stop at the end of the query or at the first lexing error; what was written so far is kept.
+                    break;
+
+                res_data.insert(token.begin, token.end);
+            }
+
+            res_data.push_back(0); /// Zero byte terminates the output row; the offset stored below points past it.
+            res_offsets[i] = res_data.size();
+        }
+    }
+    /// Fixed strings are not supported by this function: always throws ILLEGAL_COLUMN.
+    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
+    {
+        throw Exception("Cannot apply function normalizeQuery to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+/// Carries the name of the function as a compile-time constant.
+struct Name
+{
+    static constexpr auto name = "normalizeQuery";
+};
+
+}
+/// Registers the function in the given factory. NOTE(review): the template arguments of registerFunction (as well as the #include targets and the reinterpret_cast types above) are missing in this copy of the patch - it looks like text between angle brackets was lost; restore before applying.
+void registerFunctionNormalizeQuery(FunctionFactory & factory)
+{
+    factory.registerFunction>();
+}
+
+}
+
diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp
index ae09498ee99..06661b2fdce 100644
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@@ -30,6 +30,7 @@ void registerFunctionStartsWith(FunctionFactory &);
 void registerFunctionEndsWith(FunctionFactory &);
 void registerFunctionTrim(FunctionFactory &);
 void registerFunctionRegexpQuoteMeta(FunctionFactory &);
+void registerFunctionNormalizeQuery(FunctionFactory &);
 
 #if USE_BASE64
 void registerFunctionBase64Encode(FunctionFactory &);
@@ -62,6 +63,7 @@ void registerFunctionsString(FunctionFactory & factory)
     registerFunctionEndsWith(factory);
     registerFunctionTrim(factory);
     registerFunctionRegexpQuoteMeta(factory);
+    registerFunctionNormalizeQuery(factory);
 #if USE_BASE64
     registerFunctionBase64Encode(factory);
     registerFunctionBase64Decode(factory);