Merge pull request #13816 from ClickHouse/normalize-query

Add function "normalizeQuery"
2024-09-21 01:00:48 +00:00 · 2020-08-17 16:23:24 +03:00 · 2020-08-17 16:23:24 +03:00 · abf3cb8746
commit abf3cb8746
parent 695c2aafba 59b8153506
9 changed files with 418 additions and 0 deletions
--- a/src/Functions/CMakeLists.txt
+++ b/src/Functions/CMakeLists.txt
@ -34,6 +34,7 @@ target_link_libraries(clickhouse_functions
        ${FASTOPS_LIBRARY}
        clickhouse_dictionaries
        clickhouse_dictionaries_embedded
+        clickhouse_parsers
        consistent-hashing
        consistent-hashing-sumbur
        dbms
--- a/src/Functions/normalizeQuery.cpp
+++ b/src/Functions/normalizeQuery.cpp
@ -0,0 +1,171 @@
+#include <DataTypes/DataTypeString.h>
+#include <Columns/ColumnString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionStringToString.h>
+#include <Parsers/Lexer.h>
+#include <common/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+}
+
+namespace
+{
+/** Implementation of 'normalizeQuery' in the form used by FunctionStringToString (see the registration below).
+  *
+  * vector() takes the chars and offsets of the source string column (data, offsets)
+  * and appends the normalized rows to res_data, filling res_offsets.
+  * Every row is split into tokens with Lexer and written back with the following changes:
+  * - a run of insignificant tokens (whitespace, comments) becomes a single space;
+  * - Number and StringLiteral tokens become '?', and a comma separated sequence
+  *   of two or more literals becomes a single '?..';
+  * - a QuotedIdentifier, or a BareWord that is not directly followed by '(', becomes `?`
+  *   if it contains whitespace or more than two digits;
+  * - all other tokens are copied as is.
+  * Processing of a row stops at the end of stream or at the first error token.
+  */
+struct Impl
+{
+    static void vector(const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        size_t size = offsets.size();
+        res_offsets.resize(size);
+        res_data.reserve(data.size());  /// Just a size hint: the source size is used as an estimate of the result size.
+        /** Row i occupies the bytes [offsets[i - 1], offsets[i]) of 'data' (the first row starts at 0).
+          * The last byte of the row is not passed to the Lexer: presumably it is the terminating zero,
+          * the result rows are written with one as well (see the end of the loop body).
+          */
+        ColumnString::Offset prev_src_offset = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            ColumnString::Offset curr_src_offset = offsets[i];
+            Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
+            prev_src_offset = offsets[i];
+
+            /// Coalesce whitespace characters and comments to a single whitespace.
+            bool prev_insignificant = false;
+
+            /// Coalesce a list of comma separated literals to a single '?..' sequence.
+            size_t num_literals_in_sequence = 0;  /// Literals seen in the current sequence (0 - not inside a sequence).
+            bool prev_comma = false;  /// A comma after a literal was skipped and is not written yet.
+            bool prev_whitespace = false;  /// Whitespace inside the sequence was skipped and is not written yet.
+
+            while (true)
+            {
+                Token token = lexer.nextToken();
+
+                if (!token.isSignificant())
+                {
+                    /// Replace a sequence of insignificant tokens with single whitespace.
+                    if (!prev_insignificant)
+                    {
+                        if (0 == num_literals_in_sequence)
+                            res_data.push_back(' ');
+                        else
+                            prev_whitespace = true;  /// Inside a sequence of literals: it is written only when the sequence ends.
+                    }
+                    prev_insignificant = true;
+                    continue;
+                }
+
+                prev_insignificant = false;
+
+                /** Literals. Only the first literal of a sequence writes '?', the following ones are just counted.
+                  * A comma and whitespace remembered between two literals are dropped.
+                  */
+                if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
+                {
+                    if (0 == num_literals_in_sequence)
+                        res_data.push_back('?');
+                    ++num_literals_in_sequence;
+                    prev_whitespace = false;
+                    prev_comma = false;
+                    continue;
+                }
+                else if (token.type == TokenType::Comma)  /// A comma outside of a sequence of literals is copied as is at the end of the loop body.
+                {
+                    if (num_literals_in_sequence)
+                    {
+                        prev_comma = true;  /// Written only if the sequence ends before the next literal.
+                        continue;
+                    }
+                }
+                else  /// Any other token ends the sequence of literals (if any): write what was postponed.
+                {
+                    if (num_literals_in_sequence > 1)
+                    {
+                        res_data.push_back('.');  /// '?' is already written for the first literal, complete it to '?..'.
+                        res_data.push_back('.');
+                    }
+
+                    if (prev_comma)
+                        res_data.push_back(',');
+
+                    if (prev_whitespace)
+                        res_data.push_back(' ');
+
+                    num_literals_in_sequence = 0;
+                    prev_comma = false;
+                    prev_whitespace = false;
+                }
+
+                /// Slightly normalize something that looks like aliases - if they are complex, replace them with `?` placeholders.
+                if (token.type == TokenType::QuotedIdentifier
+                    /// Differentiate identifier from function (example: SHA224(x)).
+                    /// By the way, there is padding in columns and pointer dereference is Ok.
+                    || (token.type == TokenType::BareWord && *token.end != '('))
+                {
+                    /// Identifier is complex if it contains whitespace or more than two digits (counted in total, not necessarily adjacent).
+                    size_t num_digits = 0;
+
+                    const char * pos = token.begin;
+                    for (; pos != token.end; ++pos)
+                    {
+                        if (isWhitespaceASCII(*pos))
+                            break;
+
+                        if (isNumericASCII(*pos))
+                        {
+                            ++num_digits;
+                            if (num_digits > 2)
+                                break;
+                        }
+                    }
+                    /// The scan reached the end of the token only if the identifier is simple: then it is copied as is.
+                    if (pos == token.end)
+                    {
+                        res_data.insert(token.begin, token.end);
+                    }
+                    else
+                    {
+                        res_data.push_back('`');
+                        res_data.push_back('?');
+                        res_data.push_back('`');
+                    }
+
+                    continue;
+                }
+                /// A token that is neither a literal nor a comma has passed the 'else' branch above, so the postponed '..', ',' and ' ' are already written here.
+                if (token.isEnd() || token.isError())
+                    break;
+
+                res_data.insert(token.begin, token.end);
+            }
+
+            res_data.push_back(0);  /// Zero byte at the end of the result row.
+            res_offsets[i] = res_data.size();  /// The offset points right after that zero byte.
+        }
+    }
+    /// Fixed-size strings are not supported: always throws ILLEGAL_COLUMN.
+    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
+    {
+        throw Exception("Cannot apply function normalizeQuery to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+/// Holds the name of the function; passed to FunctionStringToString together with Impl (see the registration below).
+struct Name
+{
+    static constexpr auto name = "normalizeQuery";
+};
+
+}
+/// Registers 'normalizeQuery' (FunctionStringToString instantiated with Impl and Name) in the given factory.
+void registerFunctionNormalizeQuery(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionStringToString<Impl, Name>>();
+}
+
+}
+
--- a/src/Functions/normalizedQueryHash.cpp
+++ b/src/Functions/normalizedQueryHash.cpp
@ -0,0 +1,183 @@
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnsNumber.h>
+#include <Functions/FunctionFactory.h>
+#include <Parsers/Lexer.h>
+#include <common/find_symbols.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/SipHash.h>
+
+
+/** The function returns a 64-bit hash value that is identical for similar queries.
+  * See also 'normalizeQuery'. This function is only slightly more efficient.
+  */
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+/** Calculates the hash for every row of a string column (data, offsets) and stores it in res_data.
+  * The row is split into tokens with Lexer and a SipHash state is updated as follows:
+  * - insignificant tokens are skipped and do not affect the hash;
+  * - a sequence of comma separated Number / StringLiteral tokens contributes one zero byte,
+  *   one more zero byte if there were two or more literals in it,
+  *   and a ',' if the sequence ended with a comma;
+  * - a QuotedIdentifier, or a BareWord that is not directly followed by '(', contributes the single byte 0x01
+  *   if it contains whitespace or more than two digits;
+  * - all other tokens contribute their bytes as is.
+  * Processing of a row stops at the end of stream or at the first error token.
+  */
+struct Impl
+{
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        PaddedPODArray<UInt64> & res_data)
+    {
+        size_t size = offsets.size();
+        res_data.resize(size);
+        /** Row i occupies the bytes [offsets[i - 1], offsets[i]) of 'data' (the first row starts at 0).
+          * The last byte of the row is not passed to the Lexer (presumably it is the terminating zero - TODO confirm).
+          */
+        ColumnString::Offset prev_src_offset = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            SipHash hash;
+
+            ColumnString::Offset curr_src_offset = offsets[i];
+            Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
+            prev_src_offset = offsets[i];
+
+            /// Coalesce a list of comma separated literals.
+            size_t num_literals_in_sequence = 0;  /// Literals seen in the current sequence (0 - not inside a sequence).
+            bool prev_comma = false;  /// A comma after a literal was skipped and is not hashed yet.
+
+            while (true)
+            {
+                Token token = lexer.nextToken();
+
+                if (!token.isSignificant())
+                    continue;  /// Insignificant tokens do not affect the hash.
+
+                /// Literals.
+                if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
+                {
+                    if (0 == num_literals_in_sequence)
+                        hash.update("\x00", 1);  /// One zero byte stands for the first literal of a sequence.
+                    ++num_literals_in_sequence;
+                    prev_comma = false;
+                    continue;
+                }
+                else if (token.type == TokenType::Comma)  /// A comma outside of a sequence of literals is hashed as is at the end of the loop body.
+                {
+                    if (num_literals_in_sequence)
+                    {
+                        prev_comma = true;  /// Hashed only if the sequence ends before the next literal.
+                        continue;
+                    }
+                }
+                else  /// Any other token ends the sequence of literals (if any).
+                {
+                    if (num_literals_in_sequence > 1)
+                        hash.update("\x00", 1);  /// One more zero byte marks a sequence of two or more literals.
+
+                    if (prev_comma)
+                        hash.update(",", 1);
+
+                    num_literals_in_sequence = 0;
+                    prev_comma = false;
+                }
+
+                /// Slightly normalize something that looks like aliases - if they are complex, hash a single 0x01 byte instead of the name.
+                if (token.type == TokenType::QuotedIdentifier
+                    /// Differentiate identifier from function (example: SHA224(x)).
+                    /// By the way, there is padding in columns and pointer dereference is Ok.
+                    || (token.type == TokenType::BareWord && *token.end != '('))
+                {
+                    /// Identifier is complex if it contains whitespace or more than two digits (counted in total, not necessarily adjacent).
+                    size_t num_digits = 0;
+
+                    const char * pos = token.begin;
+                    for (; pos != token.end; ++pos)
+                    {
+                        if (isWhitespaceASCII(*pos))
+                            break;
+
+                        if (isNumericASCII(*pos))
+                        {
+                            ++num_digits;
+                            if (num_digits > 2)
+                                break;
+                        }
+                    }
+                    /// The scan reached the end of the token only if the identifier is simple: then its bytes are hashed as is.
+                    if (pos == token.end)
+                        hash.update(token.begin, token.size());
+                    else
+                        hash.update("\x01", 1);
+
+                    continue;
+                }
+                /// A token that is neither a literal nor a comma has passed the 'else' branch above, so the postponed zero byte and ',' are already hashed here.
+                if (token.isEnd() || token.isError())
+                    break;
+
+                hash.update(token.begin, token.size());
+            }
+
+            res_data[i] = hash.get64();
+        }
+    }
+};
+/// Function 'normalizedQueryHash': takes one String argument and returns UInt64 calculated by Impl::vector.
+class FunctionNormalizedQueryHash : public IFunction
+{
+public:
+    static constexpr auto name = "normalizedQueryHash";
+    static FunctionPtr create(const Context &)
+    {
+        return std::make_shared<FunctionNormalizedQueryHash>();
+    }
+
+    String getName() const override
+    {
+        return name;
+    }
+
+    size_t getNumberOfArguments() const override
+    {
+        return 1;
+    }
+    /// The only argument must be a String, otherwise ILLEGAL_TYPE_OF_ARGUMENT is thrown. The result type is UInt64.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeUInt64>();
+    }
+    /// Presumably lets the framework deal with constant arguments, so that executeImpl only has to handle ColumnString - TODO confirm.
+    bool useDefaultImplementationForConstants() const override { return true; }
+    /// Expects the argument to be a ColumnString; any other column is rejected with ILLEGAL_COLUMN.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) const override
+    {
+        const ColumnPtr column = block.getByPosition(arguments[0]).column;
+        if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
+        {
+            auto col_res = ColumnUInt64::create();
+            typename ColumnUInt64::Container & vec_res = col_res->getData();
+            vec_res.resize(col->size());  /// Impl::vector resizes the result to the number of offsets as well.
+            Impl::vector(col->getChars(), col->getOffsets(), vec_res);
+            block.getByPosition(result).column = std::move(col_res);
+        }
+        else
+            throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(),
+                ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+
+}
+/// Registers FunctionNormalizedQueryHash ('normalizedQueryHash') in the given factory.
+void registerFunctionNormalizedQueryHash(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionNormalizedQueryHash>();
+}
+
+}
+
--- a/src/Functions/registerFunctionsString.cpp
+++ b/src/Functions/registerFunctionsString.cpp
@ -30,6 +30,8 @@ void registerFunctionStartsWith(FunctionFactory &);
 void registerFunctionEndsWith(FunctionFactory &);
 void registerFunctionTrim(FunctionFactory &);
 void registerFunctionRegexpQuoteMeta(FunctionFactory &);
+void registerFunctionNormalizeQuery(FunctionFactory &);
+void registerFunctionNormalizedQueryHash(FunctionFactory &);

 #if USE_BASE64
 void registerFunctionBase64Encode(FunctionFactory &);
@ -62,6 +64,8 @@ void registerFunctionsString(FunctionFactory & factory)
    registerFunctionEndsWith(factory);
    registerFunctionTrim(factory);
    registerFunctionRegexpQuoteMeta(factory);
+    registerFunctionNormalizeQuery(factory);
+    registerFunctionNormalizedQueryHash(factory);
 #if USE_BASE64
    registerFunctionBase64Encode(factory);
    registerFunctionBase64Decode(factory);
--- a/src/Functions/ya.make.in
+++ b/src/Functions/ya.make.in
@ -17,6 +17,7 @@ ADDINCL(

 PEERDIR(
    clickhouse/src/Common
+    clickhouse/src/Parsers
    clickhouse/src/Dictionaries
    contrib/libs/farmhash
    contrib/libs/fastops/fastops
--- a/tests/queries/0_stateless/01451_normalize_query.reference
+++ b/tests/queries/0_stateless/01451_normalize_query.reference
@ -0,0 +1,22 @@
+SELECT ?
+SELECT ?
+SELECT ?..
+SELECT ?.., 
+SELECT ?.., 
+SELECT ?..
+SELECT ?.. WHERE ?
+SELECT ?.. WHERE ? = ?
+SELECT ?.. WHERE ? = ? AND (x, y)
+SELECT ?.. WHERE ? = ? AND (?, y)
+[?..]
+[?.., x]
+SELECT ?.. WHERE ? = ? AND (?, y) LIMIT ?..
+SELECT ? AS `xyz`
+SELECT ? AS `xyz1`
+SELECT ? AS `xyz11`
+SELECT ? AS `?`
+SELECT ? AS xyz1
+SELECT ? AS xyz11
+SELECT ? xyz11
+SELECT ?, xyz11
+SELECT ?..
--- a/tests/queries/0_stateless/01451_normalize_query.sql
+++ b/tests/queries/0_stateless/01451_normalize_query.sql
@ -0,0 +1,22 @@
+SELECT normalizeQuery('SELECT 1'); -- A literal is replaced with '?'.
+SELECT normalizeQuery('SELECT  1'); -- Repeated whitespace is collapsed to a single space.
+SELECT normalizeQuery('SELECT  1, 1, 1'); -- A list of literals is replaced with '?..'.
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hwllo */'); -- The trailing comma and one space for the comment are kept after '?..'.
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */');
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\''); -- Numbers and strings can be mixed in one list; the comment inside the list is dropped.
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1');
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1');
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (x, y)');
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (1, y)');
+SELECT normalizeQuery('[1, 2, 3]');
+SELECT normalizeQuery('[1, 2, 3, x]'); -- An identifier ends the list of literals; the comma before it is kept.
+SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (1, y) LIMIT 1, 1');
+SELECT normalizeQuery('SELECT 1 AS `xyz`'); -- Aliases with at most two digits are kept as is, quoted or not.
+SELECT normalizeQuery('SELECT 1 AS `xyz1`');
+SELECT normalizeQuery('SELECT 1 AS `xyz11`');
+SELECT normalizeQuery('SELECT 1 AS xyz111'); -- More than two digits: the alias is replaced with `?`.
+SELECT normalizeQuery('SELECT 1 AS xyz1');
+SELECT normalizeQuery('SELECT 1 AS xyz11');
+SELECT normalizeQuery('SELECT 1 xyz11');
+SELECT normalizeQuery('SELECT 1, xyz11');
+SELECT normalizeQuery('SELECT 1, ''xyz11'''); -- The doubled quotes are escaped quotes: the inner query is SELECT 1, 'xyz11'.
--- a/tests/queries/0_stateless/01452_normalized_query_hash.reference
+++ b/tests/queries/0_stateless/01452_normalized_query_hash.reference
@ -0,0 +1,7 @@
+1
+1
+1
+1
+1
+1
+1
--- a/tests/queries/0_stateless/01452_normalized_query_hash.sql
+++ b/tests/queries/0_stateless/01452_normalized_query_hash.sql
@ -0,0 +1,7 @@
+SELECT normalizedQueryHash('SELECT 1') = normalizedQueryHash('SELECT 2'); -- The value of a literal does not affect the hash.
+SELECT normalizedQueryHash('SELECT  1') != normalizedQueryHash('SELECT  1, 1, 1'); -- A single literal differs from a list of literals.
+SELECT normalizedQueryHash('SELECT 1, 1, 1, /* Hello */ \'abc\'') = normalizedQueryHash('SELECT 2, 3'); -- The length of a list and comments inside it do not matter.
+SELECT normalizedQueryHash('[1, 2, 3]') = normalizedQueryHash('[1, ''x'']'); -- Numbers and strings are not distinguished.
+SELECT normalizedQueryHash('[1, 2, 3, x]') != normalizedQueryHash('[1, x]'); -- A list of literals before x differs from a single literal before x.
+SELECT normalizedQueryHash('SELECT 1 AS `xyz`') != normalizedQueryHash('SELECT 1 AS `abc`'); -- Simple aliases are significant.
+SELECT normalizedQueryHash('SELECT 1 AS xyz111') = normalizedQueryHash('SELECT 2 AS xyz234'); -- Aliases with more than two digits are not.