diff --git a/src/Functions/normalizedQueryHash.cpp b/src/Functions/normalizedQueryHash.cpp
new file mode 100644
index 00000000000..cc9636dc14c
--- /dev/null
+++ b/src/Functions/normalizedQueryHash.cpp
@@ -0,0 +1,196 @@
+#include <DataTypes/DataTypesNumber.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnsNumber.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Parsers/Lexer.h>
+#include <common/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/SipHash.h>
+
+
+/** The function returns a 64-bit hash value that is identical for similar queries.
+  * See also 'normalizeQuery'. This function is only slightly more efficient.
+  */
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int ILLEGAL_COLUMN;
+}
+
+namespace
+{
+
+struct Impl
+{
+    /** Fills res_data with one 64-bit hash per row of the string column (data + offsets).
+      * Every row is tokenized with Lexer and the tokens are fed into SipHash, except that:
+      *  - tokens that are not significant (Token::isSignificant) are skipped;
+      *  - a sequence of comma separated number/string literals is replaced with marker bytes;
+      *  - complex identifiers (see below) are replaced with a single placeholder byte.
+      */
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        PaddedPODArray<UInt64> & res_data)
+    {
+        size_t size = offsets.size();
+        res_data.resize(size);
+
+        ColumnString::Offset prev_src_offset = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            SipHash hash;
+
+            /// The lexer range ends one byte before the row's end offset.
+            ColumnString::Offset curr_src_offset = offsets[i];
+            Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
+            prev_src_offset = curr_src_offset;
+
+            /// Coalesce a list of comma separated literals.
+            size_t num_literals_in_sequence = 0;
+            bool prev_comma = false;
+
+            while (true)
+            {
+                Token token = lexer.nextToken();
+
+                if (!token.isSignificant())
+                    continue;
+
+                /// Literals. Only the first literal of a sequence contributes a marker byte.
+                if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
+                {
+                    if (0 == num_literals_in_sequence)
+                        hash.update("\x00", 1);
+                    ++num_literals_in_sequence;
+                    prev_comma = false;
+                    continue;
+                }
+                else if (token.type == TokenType::Comma)
+                {
+                    /// A comma right after a literal may continue the sequence: defer hashing it.
+                    if (num_literals_in_sequence)
+                    {
+                        prev_comma = true;
+                        continue;
+                    }
+                }
+                else
+                {
+                    /// The sequence is over. Sequences of two or more literals get a second marker byte,
+                    /// so they hash differently from a single literal.
+                    if (num_literals_in_sequence > 1)
+                        hash.update("\x00", 1);
+
+                    /// The deferred comma did not continue the sequence, so it is hashed as is.
+                    if (prev_comma)
+                        hash.update(",", 1);
+
+                    num_literals_in_sequence = 0;
+                    prev_comma = false;
+                }
+
+                /// Slightly normalize things that look like aliases: if they are complex, hash a placeholder instead.
+                if (token.type == TokenType::QuotedIdentifier
+                    /// Differentiate identifier from function (example: SHA224(x)).
+                    /// By the way, there is padding in columns and pointer dereference is Ok.
+                    || (token.type == TokenType::BareWord && *token.end != '('))
+                {
+                    /// Identifier is complex if it contains whitespace or more than two digits.
+                    size_t num_digits = 0;
+
+                    const char * pos = token.begin;
+                    for (; pos != token.end; ++pos)
+                    {
+                        if (isWhitespaceASCII(*pos))
+                            break;
+
+                        if (isNumericASCII(*pos))
+                        {
+                            ++num_digits;
+                            if (num_digits > 2)
+                                break;
+                        }
+                    }
+
+                    /// The scan reached the end only if the identifier is simple.
+                    if (pos == token.end)
+                        hash.update(token.begin, token.size());
+                    else
+                        hash.update("\x01", 1);
+
+                    continue;
+                }
+
+                if (token.isEnd() || token.isError())
+                    break;
+
+                hash.update(token.begin, token.size());
+            }
+
+            res_data[i] = hash.get64();
+        }
+    }
+};
+
+/// Function 'normalizedQueryHash': takes one String argument and returns UInt64.
+class FunctionNormalizedQueryHash : public IFunction
+{
+public:
+    static constexpr auto name = "normalizedQueryHash";
+    static FunctionPtr create(const Context &)
+    {
+        return std::make_shared<FunctionNormalizedQueryHash>();
+    }
+
+    String getName() const override
+    {
+        return name;
+    }
+
+    size_t getNumberOfArguments() const override
+    {
+        return 1;
+    }
+
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeUInt64>();
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) const override
+    {
+        const ColumnPtr column = block.getByPosition(arguments[0]).column;
+        if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
+        {
+            auto col_res = ColumnUInt64::create();
+            /// Impl::vector resizes the result itself.
+            ColumnUInt64::Container & vec_res = col_res->getData();
+            Impl::vector(col->getChars(), col->getOffsets(), vec_res);
+            block.getByPosition(result).column = std::move(col_res);
+        }
+        else
+            throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(),
+                ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+
+}
+
+void registerFunctionNormalizedQueryHash(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionNormalizedQueryHash>();
+}
+
+}
+
diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp
index 06661b2fdce..5d4c165e1e3 100644
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@@ -31,6 +31,7 @@ void registerFunctionEndsWith(FunctionFactory &);
 void registerFunctionTrim(FunctionFactory &);
 void registerFunctionRegexpQuoteMeta(FunctionFactory &);
 void registerFunctionNormalizeQuery(FunctionFactory &);
+void registerFunctionNormalizedQueryHash(FunctionFactory &);
 
 #if USE_BASE64
 void registerFunctionBase64Encode(FunctionFactory &);
@@ -64,6 +65,7 @@ void registerFunctionsString(FunctionFactory & factory)
     registerFunctionTrim(factory);
     registerFunctionRegexpQuoteMeta(factory);
     registerFunctionNormalizeQuery(factory);
+    registerFunctionNormalizedQueryHash(factory);
 #if USE_BASE64
     registerFunctionBase64Encode(factory);
     registerFunctionBase64Decode(factory);
diff --git a/tests/queries/0_stateless/01452_normalized_query_hash.reference b/tests/queries/0_stateless/01452_normalized_query_hash.reference
new file mode 100644
index 00000000000..fcb49fa9945
--- /dev/null
+++ b/tests/queries/0_stateless/01452_normalized_query_hash.reference
@@ -0,0 +1,7 @@
+1
+1
+1
+1
+1
+1
+1
diff --git a/tests/queries/0_stateless/01452_normalized_query_hash.sql b/tests/queries/0_stateless/01452_normalized_query_hash.sql
new file mode 100644
index 00000000000..a888d2b87b5
--- /dev/null
+++ b/tests/queries/0_stateless/01452_normalized_query_hash.sql
@@ -0,0 +1,7 @@
+SELECT normalizedQueryHash('SELECT 1') = normalizedQueryHash('SELECT 2');
+SELECT normalizedQueryHash('SELECT 1') != normalizedQueryHash('SELECT 1, 1, 1');
+SELECT normalizedQueryHash('SELECT 1, 1, 1, /* Hello */ \'abc\'') = normalizedQueryHash('SELECT 2, 3');
+SELECT normalizedQueryHash('[1, 2, 3]') = normalizedQueryHash('[1, ''x'']');
+SELECT normalizedQueryHash('[1, 2, 3, x]') != normalizedQueryHash('[1, x]');
+SELECT normalizedQueryHash('SELECT 1 AS `xyz`') != normalizedQueryHash('SELECT 1 AS `abc`');
+SELECT normalizedQueryHash('SELECT 1 AS xyz111') = normalizedQueryHash('SELECT 2 AS xyz234');