Add normalizedQueryHash function with tests

This commit is contained in:
Alexey Milovidov 2020-08-16 22:38:56 +03:00
parent 6a55ec0390
commit 9cd2ddb646
4 changed files with 198 additions and 0 deletions

View File

@ -0,0 +1,182 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Functions/FunctionFactory.h>
#include <Parsers/Lexer.h>
#include <common/find_symbols.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/SipHash.h>
/** The function returns 64bit hash value that is identical for similar queries.
* See also 'normalizeQuery'. This function is only slightly more efficient.
*/
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
namespace
{
struct Impl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<UInt64> & res_data)
{
size_t size = offsets.size();
res_data.resize(size);
ColumnString::Offset prev_src_offset = 0;
for (size_t i = 0; i < size; ++i)
{
SipHash hash;
ColumnString::Offset curr_src_offset = offsets[i];
Lexer lexer(reinterpret_cast<const char *>(&data[prev_src_offset]), reinterpret_cast<const char *>(&data[curr_src_offset - 1]));
prev_src_offset = offsets[i];
/// Coalesce a list of comma separated literals.
size_t num_literals_in_sequence = 0;
bool prev_comma = false;
while (true)
{
Token token = lexer.nextToken();
if (!token.isSignificant())
continue;
/// Literals.
if (token.type == TokenType::Number || token.type == TokenType::StringLiteral)
{
if (0 == num_literals_in_sequence)
hash.update("\x00", 1);
++num_literals_in_sequence;
prev_comma = false;
continue;
}
else if (token.type == TokenType::Comma)
{
if (num_literals_in_sequence)
{
prev_comma = true;
continue;
}
}
else
{
if (num_literals_in_sequence > 1)
hash.update("\x00", 1);
if (prev_comma)
hash.update(",", 1);
num_literals_in_sequence = 0;
prev_comma = false;
}
/// Slightly normalize something that look like aliases - if they are complex, replace them to `?` placeholders.
if (token.type == TokenType::QuotedIdentifier
/// Differentiate identifier from function (example: SHA224(x)).
/// By the way, there is padding in columns and pointer dereference is Ok.
|| (token.type == TokenType::BareWord && *token.end != '('))
{
/// Identifier is complex if it contains whitespace or more than two digits.
size_t num_digits = 0;
const char * pos = token.begin;
for (; pos != token.end; ++pos)
{
if (isWhitespaceASCII(*pos))
break;
if (isNumericASCII(*pos))
{
++num_digits;
if (num_digits > 2)
break;
}
}
if (pos == token.end)
hash.update(token.begin, token.size());
else
hash.update("\x01", 1);
continue;
}
if (token.isEnd() || token.isError())
break;
hash.update(token.begin, token.size());
}
res_data[i] = hash.get64();
}
}
};
class FunctionNormalizedQueryHash : public IFunction
{
public:
static constexpr auto name = "normalizedQueryHash";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionNormalizedQueryHash>();
}
String getName() const override
{
return name;
}
size_t getNumberOfArguments() const override
{
return 1;
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt64>();
}
bool useDefaultImplementationForConstants() const override { return true; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) const override
{
const ColumnPtr column = block.getByPosition(arguments[0]).column;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
{
auto col_res = ColumnUInt64::create();
typename ColumnUInt64::Container & vec_res = col_res->getData();
vec_res.resize(col->size());
Impl::vector(col->getChars(), col->getOffsets(), vec_res);
block.getByPosition(result).column = std::move(col_res);
}
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
}
void registerFunctionNormalizedQueryHash(FunctionFactory & factory)
{
factory.registerFunction<FunctionNormalizedQueryHash>();
}
}

View File

@ -31,6 +31,7 @@ void registerFunctionEndsWith(FunctionFactory &);
void registerFunctionTrim(FunctionFactory &);
void registerFunctionRegexpQuoteMeta(FunctionFactory &);
void registerFunctionNormalizeQuery(FunctionFactory &);
void registerFunctionNormalizedQueryHash(FunctionFactory &);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
@ -64,6 +65,7 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionTrim(factory);
registerFunctionRegexpQuoteMeta(factory);
registerFunctionNormalizeQuery(factory);
registerFunctionNormalizedQueryHash(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
registerFunctionBase64Decode(factory);

View File

@ -0,0 +1,7 @@
1
1
1
1
1
1
1

View File

@ -0,0 +1,7 @@
SELECT normalizedQueryHash('SELECT 1') = normalizedQueryHash('SELECT 2');
SELECT normalizedQueryHash('SELECT 1') != normalizedQueryHash('SELECT 1, 1, 1');
SELECT normalizedQueryHash('SELECT 1, 1, 1, /* Hello */ \'abc\'') = normalizedQueryHash('SELECT 2, 3');
SELECT normalizedQueryHash('[1, 2, 3]') = normalizedQueryHash('[1, ''x'']');
SELECT normalizedQueryHash('[1, 2, 3, x]') != normalizedQueryHash('[1, x]');
SELECT normalizedQueryHash('SELECT 1 AS `xyz`') != normalizedQueryHash('SELECT 1 AS `abc`');
SELECT normalizedQueryHash('SELECT 1 AS xyz111') = normalizedQueryHash('SELECT 2 AS xyz234');