mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 01:00:48 +00:00
Merge pull request #13816 from ClickHouse/normalize-query
Add function "normalizeQuery"
This commit is contained in:
commit
abf3cb8746
@ -34,6 +34,7 @@ target_link_libraries(clickhouse_functions
|
||||
${FASTOPS_LIBRARY}
|
||||
clickhouse_dictionaries
|
||||
clickhouse_dictionaries_embedded
|
||||
clickhouse_parsers
|
||||
consistent-hashing
|
||||
consistent-hashing-sumbur
|
||||
dbms
|
||||
|
171
src/Functions/normalizeQuery.cpp
Normal file
171
src/Functions/normalizeQuery.cpp
Normal file
@ -0,0 +1,171 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Parsers/Lexer.h>
|
||||
#include <common/find_symbols.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct Impl
|
||||
{
|
||||
static void vector(const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
size_t size = offsets.size();
|
||||
res_offsets.resize(size);
|
||||
res_data.reserve(data.size());
|
||||
|
||||
ColumnString::Offset prev_src_offset = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
ColumnString::Offset curr_src_offset = offsets[i];
|
||||
Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
|
||||
prev_src_offset = offsets[i];
|
||||
|
||||
/// Coalesce whitespace characters and comments to a single whitespace.
|
||||
bool prev_insignificant = false;
|
||||
|
||||
/// Coalesce a list of comma separated literals to a single '?..' sequence.
|
||||
size_t num_literals_in_sequence = 0;
|
||||
bool prev_comma = false;
|
||||
bool prev_whitespace = false;
|
||||
|
||||
while (true)
|
||||
{
|
||||
Token token = lexer.nextToken();
|
||||
|
||||
if (!token.isSignificant())
|
||||
{
|
||||
/// Replace a sequence of insignificant tokens with single whitespace.
|
||||
if (!prev_insignificant)
|
||||
{
|
||||
if (0 == num_literals_in_sequence)
|
||||
res_data.push_back(' ');
|
||||
else
|
||||
prev_whitespace = true;
|
||||
}
|
||||
prev_insignificant = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
prev_insignificant = false;
|
||||
|
||||
/// Literals.
|
||||
if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
|
||||
{
|
||||
if (0 == num_literals_in_sequence)
|
||||
res_data.push_back('?');
|
||||
++num_literals_in_sequence;
|
||||
prev_whitespace = false;
|
||||
prev_comma = false;
|
||||
continue;
|
||||
}
|
||||
else if (token.type == TokenType::Comma)
|
||||
{
|
||||
if (num_literals_in_sequence)
|
||||
{
|
||||
prev_comma = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (num_literals_in_sequence > 1)
|
||||
{
|
||||
res_data.push_back('.');
|
||||
res_data.push_back('.');
|
||||
}
|
||||
|
||||
if (prev_comma)
|
||||
res_data.push_back(',');
|
||||
|
||||
if (prev_whitespace)
|
||||
res_data.push_back(' ');
|
||||
|
||||
num_literals_in_sequence = 0;
|
||||
prev_comma = false;
|
||||
prev_whitespace = false;
|
||||
}
|
||||
|
||||
/// Slightly normalize something that look like aliases - if they are complex, replace them to `?` placeholders.
|
||||
if (token.type == TokenType::QuotedIdentifier
|
||||
/// Differentiate identifier from function (example: SHA224(x)).
|
||||
/// By the way, there is padding in columns and pointer dereference is Ok.
|
||||
|| (token.type == TokenType::BareWord && *token.end != '('))
|
||||
{
|
||||
/// Identifier is complex if it contains whitespace or more than two digits.
|
||||
size_t num_digits = 0;
|
||||
|
||||
const char * pos = token.begin;
|
||||
for (; pos != token.end; ++pos)
|
||||
{
|
||||
if (isWhitespaceASCII(*pos))
|
||||
break;
|
||||
|
||||
if (isNumericASCII(*pos))
|
||||
{
|
||||
++num_digits;
|
||||
if (num_digits > 2)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == token.end)
|
||||
{
|
||||
res_data.insert(token.begin, token.end);
|
||||
}
|
||||
else
|
||||
{
|
||||
res_data.push_back('`');
|
||||
res_data.push_back('?');
|
||||
res_data.push_back('`');
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (token.isEnd() || token.isError())
|
||||
break;
|
||||
|
||||
res_data.insert(token.begin, token.end);
|
||||
}
|
||||
|
||||
res_data.push_back(0);
|
||||
res_offsets[i] = res_data.size();
|
||||
}
|
||||
}
|
||||
|
||||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
throw Exception("Cannot apply function normalizeQuery to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
/// Carries the function name; passed as the Name argument of FunctionStringToString at registration.
struct Name
{
    static constexpr const char * name = "normalizeQuery";
};
|
||||
|
||||
}
|
||||
|
||||
/// Registers normalizeQuery (FunctionStringToString instantiated with Impl and Name) in the given factory.
void registerFunctionNormalizeQuery(FunctionFactory & factory)
{
    using FunctionNormalizeQuery = FunctionStringToString<Impl, Name>;
    factory.registerFunction<FunctionNormalizeQuery>();
}
|
||||
|
||||
}
|
||||
|
183
src/Functions/normalizedQueryHash.cpp
Normal file
183
src/Functions/normalizedQueryHash.cpp
Normal file
@ -0,0 +1,183 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Parsers/Lexer.h>
|
||||
#include <common/find_symbols.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Common/SipHash.h>
|
||||
|
||||
|
||||
/** The function returns a 64-bit hash value that is identical for similar queries.
|
||||
* See also 'normalizeQuery'. This function is only slightly more efficient.
|
||||
*/
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct Impl
|
||||
{
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
PaddedPODArray<UInt64> & res_data)
|
||||
{
|
||||
size_t size = offsets.size();
|
||||
res_data.resize(size);
|
||||
|
||||
ColumnString::Offset prev_src_offset = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
SipHash hash;
|
||||
|
||||
ColumnString::Offset curr_src_offset = offsets[i];
|
||||
Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
|
||||
prev_src_offset = offsets[i];
|
||||
|
||||
/// Coalesce a list of comma separated literals.
|
||||
size_t num_literals_in_sequence = 0;
|
||||
bool prev_comma = false;
|
||||
|
||||
while (true)
|
||||
{
|
||||
Token token = lexer.nextToken();
|
||||
|
||||
if (!token.isSignificant())
|
||||
continue;
|
||||
|
||||
/// Literals.
|
||||
if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
|
||||
{
|
||||
if (0 == num_literals_in_sequence)
|
||||
hash.update("\x00", 1);
|
||||
++num_literals_in_sequence;
|
||||
prev_comma = false;
|
||||
continue;
|
||||
}
|
||||
else if (token.type == TokenType::Comma)
|
||||
{
|
||||
if (num_literals_in_sequence)
|
||||
{
|
||||
prev_comma = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (num_literals_in_sequence > 1)
|
||||
hash.update("\x00", 1);
|
||||
|
||||
if (prev_comma)
|
||||
hash.update(",", 1);
|
||||
|
||||
num_literals_in_sequence = 0;
|
||||
prev_comma = false;
|
||||
}
|
||||
|
||||
/// Slightly normalize something that look like aliases - if they are complex, replace them to `?` placeholders.
|
||||
if (token.type == TokenType::QuotedIdentifier
|
||||
/// Differentiate identifier from function (example: SHA224(x)).
|
||||
/// By the way, there is padding in columns and pointer dereference is Ok.
|
||||
|| (token.type == TokenType::BareWord && *token.end != '('))
|
||||
{
|
||||
/// Identifier is complex if it contains whitespace or more than two digits.
|
||||
size_t num_digits = 0;
|
||||
|
||||
const char * pos = token.begin;
|
||||
for (; pos != token.end; ++pos)
|
||||
{
|
||||
if (isWhitespaceASCII(*pos))
|
||||
break;
|
||||
|
||||
if (isNumericASCII(*pos))
|
||||
{
|
||||
++num_digits;
|
||||
if (num_digits > 2)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == token.end)
|
||||
hash.update(token.begin, token.size());
|
||||
else
|
||||
hash.update("\x01", 1);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (token.isEnd() || token.isError())
|
||||
break;
|
||||
|
||||
hash.update(token.begin, token.size());
|
||||
}
|
||||
|
||||
res_data[i] = hash.get64();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionNormalizedQueryHash : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "normalizedQueryHash";
|
||||
static FunctionPtr create(const Context &)
|
||||
{
|
||||
return std::make_shared<FunctionNormalizedQueryHash>();
|
||||
}
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return name;
|
||||
}
|
||||
|
||||
size_t getNumberOfArguments() const override
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<DataTypeUInt64>();
|
||||
}
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
const ColumnPtr column = block.getByPosition(arguments[0]).column;
|
||||
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
|
||||
{
|
||||
auto col_res = ColumnUInt64::create();
|
||||
typename ColumnUInt64::Container & vec_res = col_res->getData();
|
||||
vec_res.resize(col->size());
|
||||
Impl::vector(col->getChars(), col->getOffsets(), vec_res);
|
||||
block.getByPosition(result).column = std::move(col_res);
|
||||
}
|
||||
else
|
||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
/// Registers normalizedQueryHash in the given factory.
void registerFunctionNormalizedQueryHash(FunctionFactory & factory)
{
    using Function = FunctionNormalizedQueryHash;
    factory.registerFunction<Function>();
}
|
||||
|
||||
}
|
||||
|
@ -30,6 +30,8 @@ void registerFunctionStartsWith(FunctionFactory &);
|
||||
void registerFunctionEndsWith(FunctionFactory &);
|
||||
void registerFunctionTrim(FunctionFactory &);
|
||||
void registerFunctionRegexpQuoteMeta(FunctionFactory &);
|
||||
void registerFunctionNormalizeQuery(FunctionFactory &);
|
||||
void registerFunctionNormalizedQueryHash(FunctionFactory &);
|
||||
|
||||
#if USE_BASE64
|
||||
void registerFunctionBase64Encode(FunctionFactory &);
|
||||
@ -62,6 +64,8 @@ void registerFunctionsString(FunctionFactory & factory)
|
||||
registerFunctionEndsWith(factory);
|
||||
registerFunctionTrim(factory);
|
||||
registerFunctionRegexpQuoteMeta(factory);
|
||||
registerFunctionNormalizeQuery(factory);
|
||||
registerFunctionNormalizedQueryHash(factory);
|
||||
#if USE_BASE64
|
||||
registerFunctionBase64Encode(factory);
|
||||
registerFunctionBase64Decode(factory);
|
||||
|
@ -17,6 +17,7 @@ ADDINCL(
|
||||
|
||||
PEERDIR(
|
||||
clickhouse/src/Common
|
||||
clickhouse/src/Parsers
|
||||
clickhouse/src/Dictionaries
|
||||
contrib/libs/farmhash
|
||||
contrib/libs/fastops/fastops
|
||||
|
22
tests/queries/0_stateless/01451_normalize_query.reference
Normal file
22
tests/queries/0_stateless/01451_normalize_query.reference
Normal file
@ -0,0 +1,22 @@
|
||||
SELECT ?
|
||||
SELECT ?
|
||||
SELECT ?..
|
||||
SELECT ?..,
|
||||
SELECT ?..,
|
||||
SELECT ?..
|
||||
SELECT ?.. WHERE ?
|
||||
SELECT ?.. WHERE ? = ?
|
||||
SELECT ?.. WHERE ? = ? AND (x, y)
|
||||
SELECT ?.. WHERE ? = ? AND (?, y)
|
||||
[?..]
|
||||
[?.., x]
|
||||
SELECT ?.. WHERE ? = ? AND (?, y) LIMIT ?..
|
||||
SELECT ? AS `xyz`
|
||||
SELECT ? AS `xyz1`
|
||||
SELECT ? AS `xyz11`
|
||||
SELECT ? AS `?`
|
||||
SELECT ? AS xyz1
|
||||
SELECT ? AS xyz11
|
||||
SELECT ? xyz11
|
||||
SELECT ?, xyz11
|
||||
SELECT ?..
|
22
tests/queries/0_stateless/01451_normalize_query.sql
Normal file
22
tests/queries/0_stateless/01451_normalize_query.sql
Normal file
@ -0,0 +1,22 @@
|
||||
-- Exercises normalizeQuery: number and string literals become `?`, a run of literals becomes `?..`,
-- whitespace and comments collapse to a single space, and identifiers containing whitespace or more
-- than two digits become a backtick-quoted `?`. Expected output lives in the matching .reference file.
SELECT normalizeQuery('SELECT 1');
|
||||
SELECT normalizeQuery('SELECT 1');
|
||||
-- Comma-separated literal lists, with and without trailing commas and comments.
SELECT normalizeQuery('SELECT 1, 1, 1');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hwllo */');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\'');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (x, y)');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (1, y)');
|
||||
-- Literal lists inside brackets, optionally followed by an identifier.
SELECT normalizeQuery('[1, 2, 3]');
|
||||
SELECT normalizeQuery('[1, 2, 3, x]');
|
||||
SELECT normalizeQuery('SELECT 1, 1, 1, /* Hello */ \'abc\' WHERE 1 = 1 AND (1, y) LIMIT 1, 1');
|
||||
-- Aliases: kept as is with up to two digits, replaced by a placeholder with more than two.
SELECT normalizeQuery('SELECT 1 AS `xyz`');
|
||||
SELECT normalizeQuery('SELECT 1 AS `xyz1`');
|
||||
SELECT normalizeQuery('SELECT 1 AS `xyz11`');
|
||||
SELECT normalizeQuery('SELECT 1 AS xyz111');
|
||||
SELECT normalizeQuery('SELECT 1 AS xyz1');
|
||||
SELECT normalizeQuery('SELECT 1 AS xyz11');
|
||||
SELECT normalizeQuery('SELECT 1 xyz11');
|
||||
SELECT normalizeQuery('SELECT 1, xyz11');
|
||||
-- A string literal after a number joins the same literal list.
SELECT normalizeQuery('SELECT 1, ''xyz11''');
|
@ -0,0 +1,7 @@
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
@ -0,0 +1,7 @@
|
||||
-- Compares normalizedQueryHash results for pairs of queries that should or should not normalize alike.
-- NOTE(review): the file name of this hunk is not shown; presumably the 7-line reference of `1`s is its
-- expected output, i.e. every comparison below evaluates to 1 - confirm.
-- Queries differing only in literal values hash equally; a single literal differs from a literal list.
SELECT normalizedQueryHash('SELECT 1') = normalizedQueryHash('SELECT 2');
|
||||
SELECT normalizedQueryHash('SELECT 1') != normalizedQueryHash('SELECT 1, 1, 1');
|
||||
SELECT normalizedQueryHash('SELECT 1, 1, 1, /* Hello */ \'abc\'') = normalizedQueryHash('SELECT 2, 3');
|
||||
SELECT normalizedQueryHash('[1, 2, 3]') = normalizedQueryHash('[1, ''x'']');
|
||||
SELECT normalizedQueryHash('[1, 2, 3, x]') != normalizedQueryHash('[1, x]');
|
||||
-- Simple aliases are hashed by name; aliases with more than two digits hash as the same placeholder.
SELECT normalizedQueryHash('SELECT 1 AS `xyz`') != normalizedQueryHash('SELECT 1 AS `abc`');
|
||||
SELECT normalizedQueryHash('SELECT 1 AS xyz111') = normalizedQueryHash('SELECT 2 AS xyz234');
|
Loading…
Reference in New Issue
Block a user