Add xxh3 hash function (#43411)

* impl

* try fix

* add docs

* add test

* rm unused file

* excellent
This commit is contained in:
Nikita Taranov 2022-11-26 00:14:08 +01:00 committed by GitHub
parent dc7742e8f9
commit d1c258cf20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 63 additions and 8 deletions

3
.gitmodules vendored
View File

@ -290,3 +290,6 @@
[submodule "contrib/morton-nd"]
path = contrib/morton-nd
url = https://github.com/morton-nd/morton-nd
[submodule "contrib/xxHash"]
path = contrib/xxHash
url = https://github.com/Cyan4973/xxHash.git

View File

@ -167,7 +167,9 @@ add_contrib (c-ares-cmake c-ares)
add_contrib (qpl-cmake qpl)
add_contrib (morton-nd-cmake morton-nd)
add_contrib(annoy-cmake annoy)
add_contrib (annoy-cmake annoy)
add_contrib (xxHash-cmake xxHash)
# Put all targets defined here and in subdirectories under "contrib/<immediate-subdir>" folders in GUI-based IDEs.
# Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear

1
contrib/xxHash vendored Submodule

@ -0,0 +1 @@
Subproject commit 3078dc6039f8c0bffcb1904f81cfe6b2c3209435

View File

@ -0,0 +1,13 @@
# Build rules for the xxHash library, whose sources live in the contrib/xxHash submodule.
set (XXHASH_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/xxHash")

# The library consists of a single translation unit.
add_library(xxHash "${XXHASH_SOURCE_DIR}/xxhash.c")

# XXH_INLINE_ALL - Make all functions inline, with implementations being directly included within xxhash.h.
# Inlining functions is beneficial for speed on small keys.
# https://github.com/Cyan4973/xxHash/tree/v0.8.1#build-modifiers
# PUBLIC: the definition applies both when building xxhash.c and when compiling any target linking to xxHash.
target_compile_definitions(xxHash PUBLIC XXH_INLINE_ALL)

# Expose the header directory to consumers only (INTERFACE), as a SYSTEM include
# that is prepended (BEFORE) to the consumer's include search path.
target_include_directories(xxHash SYSTEM BEFORE INTERFACE "${XXHASH_SOURCE_DIR}")

# Namespaced alias that the rest of the build links against.
add_library(ch_contrib::xxHash ALIAS xxHash)

View File

@ -137,6 +137,7 @@ function clone_submodules
contrib/hashidsxx
contrib/c-ares
contrib/morton-nd
contrib/xxHash
)
git submodule sync

View File

@ -29,9 +29,9 @@ list (APPEND PRIVATE_LIBS
ch_contrib::zlib
boost::filesystem
divide_impl
ch_contrib::xxHash
)
if (TARGET ch_rust::blake3)
list (APPEND PUBLIC_LIBS
ch_rust::blake3
@ -66,8 +66,6 @@ if (TARGET ch_contrib::base64)
list (APPEND PRIVATE_LIBS ch_contrib::base64)
endif()
list (APPEND PRIVATE_LIBS ch_contrib::lz4)
if (ENABLE_NLP)
list (APPEND PRIVATE_LIBS ch_contrib::cld2)
endif()

View File

@ -39,6 +39,13 @@ REGISTER_FUNCTION(Hashing)
factory.registerFunction<FunctionXxHash32>();
factory.registerFunction<FunctionXxHash64>();
factory.registerFunction<FunctionXXH3>(
{
"Calculates value of XXH3 64-bit hash function. Refer to https://github.com/Cyan4973/xxHash for detailed documentation.",
Documentation::Examples{{"hash", "SELECT xxh3('ClickHouse')"}},
Documentation::Categories{"Hash"}
},
FunctionFactory::CaseSensitive);
factory.registerFunction<FunctionWyHash64>();

View File

@ -3,12 +3,18 @@
#include <city.h>
#include <farmhash.h>
#include <metrohash.h>
#include <wyhash.h>
#include <MurmurHash2.h>
#include <MurmurHash3.h>
#include <wyhash.h>
#include "config.h"
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wused-but-marked-unused"
#endif
#include <xxhash.h>
#if USE_BLAKE3
# include <blake3.h>
#endif
@ -17,7 +23,6 @@
#include <Common/typeid_cast.h>
#include <Common/safe_cast.h>
#include <Common/HashTable/Hash.h>
#include <xxhash.h>
#if USE_SSL
# include <openssl/md4.h>
@ -588,7 +593,7 @@ struct ImplXxHash32
static constexpr auto name = "xxHash32";
using ReturnType = UInt32;
static auto apply(const char * s, const size_t len) { return XXH32(s, len, 0); }
static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH32(s, len, 0); }
/**
* With the current implementation, calling this with more than one argument gives results
* that are not reproducible outside of CH.
@ -609,7 +614,24 @@ struct ImplXxHash64
using ReturnType = UInt64;
using uint128_t = CityHash_v1_0_2::uint128;
static auto apply(const char * s, const size_t len) { return XXH64(s, len, 0); }
static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH64(s, len, 0); }
/*
With the current implementation, calling this with more than one argument gives results
that are not reproducible outside of CH (see the comment on ImplXxHash32).
*/
static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
static constexpr bool use_int_hash_for_pods = false;
};
struct ImplXXH3
{
static constexpr auto name = "xxh3";
using ReturnType = UInt64;
using uint128_t = CityHash_v1_0_2::uint128;
static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH3_64bits(s, len); }
/*
With current implementation with more than 1 arguments it will give the results
@ -1508,7 +1530,12 @@ using FunctionHiveHash = FunctionAnyHash<HiveHashImpl>;
using FunctionXxHash32 = FunctionAnyHash<ImplXxHash32>;
using FunctionXxHash64 = FunctionAnyHash<ImplXxHash64>;
using FunctionXXH3 = FunctionAnyHash<ImplXXH3>;
using FunctionWyHash64 = FunctionAnyHash<ImplWyHash64>;
using FunctionBLAKE3 = FunctionStringHashFixedString<ImplBLAKE3>;
}
#ifdef __clang__
# pragma clang diagnostic pop
#endif

View File

@ -15,6 +15,7 @@
<value>hiveHash</value>
<value>xxHash32</value>
<value>xxHash64</value>
<value>xxh3</value>
<value>CRC32</value>
</values>
</substitution>

View File

@ -0,0 +1 @@
18009318874338624809

View File

@ -0,0 +1 @@
SELECT xxh3('ClickHouse');