From c16ce7657ec2daf7eb0699096e909193689143a3 Mon Sep 17 00:00:00 2001
From: Michael Nutt
Date: Fri, 6 May 2022 21:25:20 -0400
Subject: [PATCH] add hashid support

---
Review notes (ignored by git-am, not part of the commit message):
 - Line breaks, indentation and tabs were reconstructed from a
   whitespace-collapsed copy of this patch, guided by the hunk line counts.
 - Include targets and template arguments were missing from that copy and
   have been restored from the surrounding code; confirm them against the tree.
 - FunctionHashID.h was changed in review: the usage comment and the
   too-many-arguments message now describe the real four-argument signature,
   and non-constant salt / min_length / alphabet columns are rejected instead
   of being silently ignored. Its hunk length and the diffstat were
   recomputed; the blob id on its index line is stale.

 .gitmodules                                   |   3 +
 contrib/CMakeLists.txt                        |   1 +
 contrib/hashidsxx                             |   1 +
 contrib/hashidsxx-cmake/CMakeLists.txt        |  21 +++
 src/Functions/CMakeLists.txt                  |   4 +
 src/Functions/FunctionHashID.cpp              |  13 ++
 src/Functions/FunctionHashID.h                | 170 ++++++++++++++++++
 src/Functions/registerFunctions.cpp           |   2 +
 src/configure_config.cmake                    |   3 +
 .../0_stateless/02293_hashid.reference        |   5 +
 tests/queries/0_stateless/02293_hashid.sql    |   1 +
 .../02293_hashid_arguments.reference          |   5 +
 .../0_stateless/02293_hashid_arguments.sql    |   1 +
 .../0_stateless/02293_hashid_const.reference  |   1 +
 .../0_stateless/02293_hashid_const.sql        |   1 +
 15 files changed, 232 insertions(+)
 create mode 160000 contrib/hashidsxx
 create mode 100644 contrib/hashidsxx-cmake/CMakeLists.txt
 create mode 100644 src/Functions/FunctionHashID.cpp
 create mode 100644 src/Functions/FunctionHashID.h
 create mode 100644 tests/queries/0_stateless/02293_hashid.reference
 create mode 100644 tests/queries/0_stateless/02293_hashid.sql
 create mode 100644 tests/queries/0_stateless/02293_hashid_arguments.reference
 create mode 100644 tests/queries/0_stateless/02293_hashid_arguments.sql
 create mode 100644 tests/queries/0_stateless/02293_hashid_const.reference
 create mode 100644 tests/queries/0_stateless/02293_hashid_const.sql

diff --git a/.gitmodules b/.gitmodules
index 6c9e66f9cbc..0972ab6a88a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -262,3 +262,6 @@
 [submodule "contrib/minizip-ng"]
 	path = contrib/minizip-ng
 	url = https://github.com/zlib-ng/minizip-ng
+[submodule "contrib/hashidsxx"]
+	path = contrib/hashidsxx
+	url = https://github.com/schoentoon/hashidsxx.git
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 1f03c0fd341..627885c67b3 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -139,6 +139,7 @@ add_contrib (libpq-cmake libpq)
 add_contrib (nuraft-cmake NuRaft)
 add_contrib (fast_float-cmake fast_float)
 add_contrib (datasketches-cpp-cmake datasketches-cpp)
+add_contrib (hashidsxx-cmake hashidsxx)
 
 option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
 if (ENABLE_NLP)
diff --git a/contrib/hashidsxx b/contrib/hashidsxx
new file mode 160000
index 00000000000..783f6911ccf
--- /dev/null
+++ b/contrib/hashidsxx
@@ -0,0 +1 @@
+Subproject commit 783f6911ccfdaca83e3cfac084c4aad888a80cee
diff --git a/contrib/hashidsxx-cmake/CMakeLists.txt b/contrib/hashidsxx-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..f916355251d
--- /dev/null
+++ b/contrib/hashidsxx-cmake/CMakeLists.txt
@@ -0,0 +1,21 @@
+option(ENABLE_HASHIDSXX "Enable hashidsxx" ${ENABLE_LIBRARIES})
+
+if (NOT ENABLE_HASHIDSXX)
+    message(STATUS "Not using hashidsxx")
+    return()
+endif()
+
+set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/hashidsxx")
+
+set (SRCS
+    "${LIBRARY_DIR}/hashids.cpp"
+)
+
+set (HDRS
+    "${LIBRARY_DIR}/hashids.h"
+)
+
+add_library(_hashidsxx ${SRCS} ${HDRS})
+target_include_directories(_hashidsxx SYSTEM PUBLIC "${LIBRARY_DIR}")
+
+add_library(ch_contrib::hashidsxx ALIAS _hashidsxx)
diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt
index debe7fac8a5..a982ee367de 100644
--- a/src/Functions/CMakeLists.txt
+++ b/src/Functions/CMakeLists.txt
@@ -96,6 +96,10 @@ if (TARGET ch_contrib::rapidjson)
     target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson)
 endif()
 
+if (TARGET ch_contrib::hashidsxx)
+    target_link_libraries(clickhouse_functions PRIVATE ch_contrib::hashidsxx)
+endif()
+
 add_subdirectory(GatherUtils)
 target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils)
 
diff --git a/src/Functions/FunctionHashID.cpp b/src/Functions/FunctionHashID.cpp
new file mode 100644
index 00000000000..14e0c7c35f3
--- /dev/null
+++ b/src/Functions/FunctionHashID.cpp
@@ -0,0 +1,13 @@
+#include "FunctionHashID.h"
+
+#include <Functions/FunctionFactory.h>
+
+
+namespace DB
+{
+
+void registerFunctionHashID(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionHashID>();
+}
+}
diff --git a/src/Functions/FunctionHashID.h b/src/Functions/FunctionHashID.h
new file mode 100644
index 00000000000..3443b6d8408
--- /dev/null
+++ b/src/Functions/FunctionHashID.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include <Common/config.h>
+
+#include <hashids.h>
+
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnsNumber.h>
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+
+#include <functional>
+#include <initializer_list>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
+    extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
+}
+
+/// hashid(integer[, salt[, min_length[, alphabet]]])
+///
+/// Encodes an unsigned integer as a string via hashidsxx::Hashids. The optional
+/// salt (String), min_length (UInt8) and alphabet (String) must be constants:
+/// one encoder is built from them and reused for every row.
+class FunctionHashID : public IFunction
+{
+public:
+    static constexpr auto name = "hashid";
+
+    static FunctionPtr create(ContextPtr)
+    {
+        return std::make_shared<FunctionHashID>();
+    }
+
+    String getName() const override
+    {
+        return name;
+    }
+
+    size_t getNumberOfArguments() const override { return 0; }
+
+    bool isVariadic() const override { return true; }
+
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
+
+    /// Checks the argument count and types; the result type is always String.
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        if (arguments.empty())
+            throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least one argument", getName());
+
+        if (!isUnsignedInteger(arguments[0].type))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "First argument of function {} must be unsigned integer, got {}", getName(), arguments[0].type->getName());
+
+        if (arguments.size() > 1)
+        {
+            if (!isString(arguments[1].type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "Second argument of function {} must be String, got {}",
+                    getName(), arguments[1].type->getName());
+        }
+
+        if (arguments.size() > 2)
+        {
+            if (!isUInt8(arguments[2].type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "Third argument of function {} must be UInt8, got {}",
+                    getName(), arguments[2].type->getName());
+        }
+
+        if (arguments.size() > 3)
+        {
+            if (!isString(arguments[3].type))
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "Fourth argument of function {} must be String, got {}",
+                    getName(), arguments[3].type->getName());
+        }
+
+        if (arguments.size() > 4)
+        {
+            throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION,
+                "Function {} expects no more than four arguments (integer, salt, min_length, optional_alphabet), got {}",
+                getName(), arguments.size());
+        }
+
+        return std::make_shared<DataTypeString>();
+    }
+
+    /// Builds one encoder from the constant optional arguments and encodes each row
+    /// of the first argument with it.
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
+    {
+        const auto & numcolumn = arguments[0].column;
+
+        if (
+            checkAndGetColumn<ColumnUInt8>(numcolumn.get())
+            || checkAndGetColumn<ColumnUInt16>(numcolumn.get())
+            || checkAndGetColumn<ColumnUInt32>(numcolumn.get())
+            || checkAndGetColumn<ColumnUInt64>(numcolumn.get())
+            || checkAndGetColumnConst<ColumnUInt8>(numcolumn.get())
+            || checkAndGetColumnConst<ColumnUInt16>(numcolumn.get())
+            || checkAndGetColumnConst<ColumnUInt32>(numcolumn.get())
+            || checkAndGetColumnConst<ColumnUInt64>(numcolumn.get())
+        )
+        {
+            std::string salt;
+            UInt8 minLength = 0;
+            std::string alphabet(DEFAULT_ALPHABET);
+
+            /// The optional arguments configure the encoder once for the whole block,
+            /// so a non-constant column is rejected instead of being silently ignored.
+            if (arguments.size() >= 4)
+            {
+                const auto & alphabetcolumn = arguments[3].column;
+                if (const auto * alpha_col = checkAndGetColumnConst<ColumnString>(alphabetcolumn.get()))
+                    alphabet = alpha_col->getValue<String>();
+                else
+                    throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                        "Fourth argument of function {} must be a constant String", getName());
+            }
+
+            if (arguments.size() >= 3)
+            {
+                const auto & minlengthcolumn = arguments[2].column;
+                if (const auto * min_length_col = checkAndGetColumnConst<ColumnUInt8>(minlengthcolumn.get()))
+                    minLength = min_length_col->getValue<UInt8>();
+                else
+                    throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                        "Third argument of function {} must be a constant UInt8", getName());
+            }
+
+            if (arguments.size() >= 2)
+            {
+                const auto & saltcolumn = arguments[1].column;
+                if (const auto * salt_col = checkAndGetColumnConst<ColumnString>(saltcolumn.get()))
+                    salt = salt_col->getValue<String>();
+                else
+                    throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                        "Second argument of function {} must be a constant String", getName());
+            }
+
+            hashidsxx::Hashids hash(salt, minLength, alphabet);
+
+            auto col_res = ColumnString::create();
+
+            for (size_t i = 0; i < input_rows_count; ++i)
+            {
+                std::string hashid = hash.encode({ numcolumn->getUInt(i) });
+                col_res->insertDataWithTerminatingZero(hashid.data(), hashid.size() + 1);
+            }
+
+            return col_res;
+        }
+        else
+            throw Exception("Illegal column " + arguments[0].column->getName()
+                    + " of first argument of function hashid",
+                ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+
+}
diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp
index 6b3c6e92945..2472b78cbcd 100644
--- a/src/Functions/registerFunctions.cpp
+++ b/src/Functions/registerFunctions.cpp
@@ -24,6 +24,7 @@ void registerFunctionsEmbeddedDictionaries(FunctionFactory &);
 void registerFunctionsExternalDictionaries(FunctionFactory &);
 void registerFunctionsExternalModels(FunctionFactory &);
 void registerFunctionsFormatting(FunctionFactory &);
+void registerFunctionHashID(FunctionFactory &);
 void registerFunctionsHashing(FunctionFactory &);
 void registerFunctionsHigherOrder(FunctionFactory &);
 void registerFunctionsLogical(FunctionFactory &);
@@ -90,6 +91,7 @@ void registerFunctions()
     registerFunctionsExternalDictionaries(factory);
     registerFunctionsExternalModels(factory);
     registerFunctionsFormatting(factory);
+    registerFunctionHashID(factory);
     registerFunctionsHashing(factory);
     registerFunctionsHigherOrder(factory);
     registerFunctionsLogical(factory);
diff --git a/src/configure_config.cmake b/src/configure_config.cmake
index 519307ba28a..aa1419c7792 100644
--- a/src/configure_config.cmake
+++ b/src/configure_config.cmake
@@ -100,3 +100,6 @@ endif()
 if (TARGET ch_contrib::jemalloc)
     set(USE_JEMALLOC 1)
 endif()
+if (TARGET ch_contrib::hashidsxx)
+    set(USE_HASHIDSXX 1)
+endif()
diff --git a/tests/queries/0_stateless/02293_hashid.reference b/tests/queries/0_stateless/02293_hashid.reference
new file mode 100644
index 00000000000..05023857670
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid.reference
@@ -0,0 +1,5 @@
+0	gY
+1	jR
+2	k5
+3	l5
+4	mO
diff --git a/tests/queries/0_stateless/02293_hashid.sql b/tests/queries/0_stateless/02293_hashid.sql
new file mode 100644
index 00000000000..51bed96c039
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid.sql
@@ -0,0 +1 @@
+select number, hashid(number) from system.numbers limit 5;
diff --git a/tests/queries/0_stateless/02293_hashid_arguments.reference b/tests/queries/0_stateless/02293_hashid_arguments.reference
new file mode 100644
index 00000000000..41f3b213cdb
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid_arguments.reference
@@ -0,0 +1,5 @@
+0	pbgkmdljlpjoapne
+1	akemglnjepjpodba
+2	obmgndljgajpkeao
+3	dldokmpjpgjgeanb
+4	nkdlpgajngjnobme
diff --git a/tests/queries/0_stateless/02293_hashid_arguments.sql b/tests/queries/0_stateless/02293_hashid_arguments.sql
new file mode 100644
index 00000000000..f1cb3a144e7
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid_arguments.sql
@@ -0,0 +1 @@
+select number, hashid(number, 's3cr3t', 16, 'abcdefghijklmnop') from system.numbers limit 5;
diff --git a/tests/queries/0_stateless/02293_hashid_const.reference b/tests/queries/0_stateless/02293_hashid_const.reference
new file mode 100644
index 00000000000..93bd202307e
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid_const.reference
@@ -0,0 +1 @@
+YQrvD5XGvbx
diff --git a/tests/queries/0_stateless/02293_hashid_const.sql b/tests/queries/0_stateless/02293_hashid_const.sql
new file mode 100644
index 00000000000..b8308d3f55b
--- /dev/null
+++ b/tests/queries/0_stateless/02293_hashid_const.sql
@@ -0,0 +1 @@
+select hashid(1234567890123456, 's3cr3t');