2012-07-15 21:43:04 +00:00
# pragma once
# include <city.h>
2015-07-17 15:56:08 +00:00
# include <farmhash.h>
2015-07-20 14:59:20 +00:00
# include <metrohash.h>
2020-04-16 12:31:57 +00:00
# if !defined(ARCADIA_BUILD)
# include <murmurhash2.h>
# include <murmurhash3.h>
# include "config_functions.h"
# include "config_core.h"
# endif
2018-12-23 19:25:40 +00:00
2019-02-10 17:40:52 +00:00
# include <Common/SipHash.h>
# include <Common/typeid_cast.h>
# include <Common/HashTable/Hash.h>
2018-12-23 19:25:40 +00:00
# if USE_XXHASH
2020-04-16 12:31:57 +00:00
# include <xxhash.h>
2019-03-22 11:18:24 +00:00
# endif
# if USE_SSL
2020-04-16 12:31:57 +00:00
# include <openssl / md5.h>
# include <openssl / sha.h>
2018-12-23 19:25:40 +00:00
# endif
2012-07-15 21:43:04 +00:00
# include <Poco/ByteOrder.h>
2017-04-01 09:19:00 +00:00
# include <DataTypes/DataTypesNumber.h>
# include <DataTypes/DataTypeString.h>
# include <DataTypes/DataTypeDate.h>
# include <DataTypes/DataTypeDateTime.h>
# include <DataTypes/DataTypeArray.h>
# include <DataTypes/DataTypeFixedString.h>
# include <DataTypes/DataTypeEnum.h>
2017-12-08 00:50:25 +00:00
# include <DataTypes/DataTypeTuple.h>
2017-04-01 09:19:00 +00:00
# include <Columns/ColumnsNumber.h>
# include <Columns/ColumnString.h>
# include <Columns/ColumnConst.h>
# include <Columns/ColumnFixedString.h>
# include <Columns/ColumnArray.h>
# include <Columns/ColumnTuple.h>
2019-12-09 13:12:54 +00:00
# include <Functions/IFunctionImpl.h>
2017-07-21 06:35:58 +00:00
# include <Functions/FunctionHelpers.h>
2020-05-25 15:16:19 +00:00
# include <Functions/TargetSpecific.h>
# include <Functions/PerformanceAdaptors.h>
2017-06-06 17:18:32 +00:00
# include <ext/range.h>
2018-07-30 18:00:16 +00:00
# include <ext/bit_cast.h>
2015-04-15 15:00:28 +00:00
2012-07-15 21:43:04 +00:00
namespace DB
{
2017-06-13 02:06:53 +00:00
namespace ErrorCodes
{
2020-02-25 18:02:41 +00:00
extern const int ILLEGAL_TYPE_OF_ARGUMENT ;
extern const int BAD_ARGUMENTS ;
2017-06-13 02:06:53 +00:00
extern const int LOGICAL_ERROR ;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH ;
2018-12-18 20:24:16 +00:00
extern const int NOT_IMPLEMENTED ;
2019-03-14 23:10:51 +00:00
extern const int ILLEGAL_COLUMN ;
2017-06-13 02:06:53 +00:00
}
2017-05-27 15:45:25 +00:00
/** Hashing functions.
  *
  * halfMD5: String -> UInt64
  *
  * A faster cryptographic hash function:
  * sipHash64: String -> UInt64
  *
  * Fast non-cryptographic hash function for strings:
  * cityHash64: String -> UInt64
  *
  * Non-cryptographic hashes from a tuple of values of any types (uses the respective function for strings and intHash64 for numbers):
  * cityHash64: any* -> UInt64
  * sipHash64: any* -> UInt64
  * halfMD5: any* -> UInt64
  *
  * Fast non-cryptographic hash functions from any integer:
  * intHash32: number -> UInt32
  * intHash64: number -> UInt64
  *
  */
2018-10-23 17:43:09 +00:00
/// 32-bit integer hash: intHash32 parameterised by a fixed salt.
struct IntHash32Impl
{
    using ReturnType = UInt32;

    /// Returns the salted intHash32 of `x`.
    static UInt32 apply(UInt64 x)
    {
        /// The salt was taken from /dev/urandom. It avoids undesirable dependencies with hashes used in other data structures.
        constexpr UInt64 salt = 0x75D9543DE018BF45ULL;
        return intHash32<salt>(x);
    }
};
/// 64-bit integer hash: the argument is XOR-ed with a fixed constant and then passed to intHash64.
struct IntHash64Impl
{
    using ReturnType = UInt64;

    /// Returns intHash64 of `x` XOR-ed with the fixed constant.
    static UInt64 apply(UInt64 x)
    {
        const auto salted = x ^ 0x4CF2D2BAAE6DA887ULL;
        return intHash64(salted);
    }
};
2019-03-22 11:18:24 +00:00
# if USE_SSL
2012-07-15 21:43:04 +00:00
struct HalfMD5Impl
{
2018-10-23 17:43:09 +00:00
static constexpr auto name = " halfMD5 " ;
2018-08-02 09:21:26 +00:00
using ReturnType = UInt64 ;
2017-04-01 07:20:54 +00:00
static UInt64 apply ( const char * begin , size_t size )
{
union
{
unsigned char char_data [ 16 ] ;
2018-08-21 15:56:50 +00:00
uint64_t uint64_data ;
2017-04-01 07:20:54 +00:00
} buf ;
MD5_CTX ctx ;
MD5_Init ( & ctx ) ;
MD5_Update ( & ctx , reinterpret_cast < const unsigned char * > ( begin ) , size ) ;
MD5_Final ( buf . char_data , & ctx ) ;
2018-11-01 17:07:20 +00:00
return Poco : : ByteOrder : : flipBytes ( static_cast < Poco : : UInt64 > ( buf . uint64_data ) ) ; /// Compatibility with existing code. Cast need for old poco AND macos where UInt64 != uint64_t
2017-04-01 07:20:54 +00:00
}
2018-10-23 17:43:09 +00:00
2018-10-24 13:12:59 +00:00
static UInt64 combineHashes ( UInt64 h1 , UInt64 h2 )
2018-10-23 17:43:09 +00:00
{
2018-10-24 13:12:59 +00:00
UInt64 hashes [ ] = { h1 , h2 } ;
return apply ( reinterpret_cast < const char * > ( hashes ) , 16 ) ;
2018-10-23 17:43:09 +00:00
}
2018-11-01 15:47:08 +00:00
/// If true, it will use intHash32 or intHash64 to hash POD types. This behaviour is intended for better performance of some functions.
/// Otherwise it will hash bytes in memory as a string using corresponding hash function.
2018-12-21 17:53:16 +00:00
2018-11-01 15:47:08 +00:00
static constexpr bool use_int_hash_for_pods = false ;
2012-07-15 21:43:04 +00:00
} ;
2014-10-29 12:25:33 +00:00
struct MD5Impl
{
2017-04-01 07:20:54 +00:00
static constexpr auto name = " MD5 " ;
enum { length = 16 } ;
static void apply ( const char * begin , const size_t size , unsigned char * out_char_data )
{
MD5_CTX ctx ;
MD5_Init ( & ctx ) ;
MD5_Update ( & ctx , reinterpret_cast < const unsigned char * > ( begin ) , size ) ;
MD5_Final ( out_char_data , & ctx ) ;
}
2014-10-29 12:25:33 +00:00
} ;
struct SHA1Impl
{
2017-04-01 07:20:54 +00:00
static constexpr auto name = " SHA1 " ;
enum { length = 20 } ;
static void apply ( const char * begin , const size_t size , unsigned char * out_char_data )
{
SHA_CTX ctx ;
SHA1_Init ( & ctx ) ;
SHA1_Update ( & ctx , reinterpret_cast < const unsigned char * > ( begin ) , size ) ;
SHA1_Final ( out_char_data , & ctx ) ;
}
2014-10-29 12:25:33 +00:00
} ;
struct SHA224Impl
{
2017-04-01 07:20:54 +00:00
static constexpr auto name = " SHA224 " ;
enum { length = 28 } ;
static void apply ( const char * begin , const size_t size , unsigned char * out_char_data )
{
SHA256_CTX ctx ;
SHA224_Init ( & ctx ) ;
SHA224_Update ( & ctx , reinterpret_cast < const unsigned char * > ( begin ) , size ) ;
SHA224_Final ( out_char_data , & ctx ) ;
}
2014-10-29 12:25:33 +00:00
} ;
struct SHA256Impl
{
2017-04-01 07:20:54 +00:00
static constexpr auto name = " SHA256 " ;
enum { length = 32 } ;
static void apply ( const char * begin , const size_t size , unsigned char * out_char_data )
{
SHA256_CTX ctx ;
SHA256_Init ( & ctx ) ;
SHA256_Update ( & ctx , reinterpret_cast < const unsigned char * > ( begin ) , size ) ;
SHA256_Final ( out_char_data , & ctx ) ;
}
2014-10-29 12:25:33 +00:00
} ;
2019-03-22 11:18:24 +00:00
# endif
2014-10-29 12:25:33 +00:00
2013-10-21 14:35:12 +00:00
struct SipHash64Impl
{
2018-10-23 17:43:09 +00:00
static constexpr auto name = " sipHash64 " ;
2018-08-02 09:21:26 +00:00
using ReturnType = UInt64 ;
2017-04-01 07:20:54 +00:00
static UInt64 apply ( const char * begin , size_t size )
{
return sipHash64 ( begin , size ) ;
}
2018-10-23 17:43:09 +00:00
2018-10-24 13:12:59 +00:00
static UInt64 combineHashes ( UInt64 h1 , UInt64 h2 )
2018-10-23 18:07:20 +00:00
{
2018-10-24 13:12:59 +00:00
UInt64 hashes [ ] = { h1 , h2 } ;
return apply ( reinterpret_cast < const char * > ( hashes ) , 16 ) ;
2018-10-23 17:43:09 +00:00
}
2013-10-21 14:35:12 +00:00
2018-11-01 15:47:08 +00:00
static constexpr bool use_int_hash_for_pods = false ;
} ;
2018-07-30 12:19:22 +00:00
2014-10-29 12:25:33 +00:00
struct SipHash128Impl
{
2017-04-01 07:20:54 +00:00
static constexpr auto name = " sipHash128 " ;
enum { length = 16 } ;
2014-10-29 12:25:33 +00:00
2017-04-01 07:20:54 +00:00
static void apply ( const char * begin , const size_t size , unsigned char * out_char_data )
{
sipHash128 ( begin , size , reinterpret_cast < char * > ( out_char_data ) ) ;
}
2014-10-29 12:25:33 +00:00
} ;
2020-04-16 12:31:57 +00:00
# if !defined(ARCADIA_BUILD)
2018-11-01 15:47:08 +00:00
/** Why do we need MurmurHash2?
  * MurmurHash2 is an outdated hash function, superseded by MurmurHash3 and subsequently by CityHash, xxHash, HighwayHash.
  * Usually there is no reason to use MurmurHash.
  * It is needed for the cases when you already have MurmurHash in some applications and you want to reproduce it
  * in ClickHouse as is. For example, it is needed to reproduce the behaviour
  * of the NGINX a/b testing module: https://nginx.ru/en/docs/http/ngx_http_split_clients_module.html
  */
/// 32-bit MurmurHash2 with seed 0.
struct MurmurHash2Impl32
{
    static constexpr auto name = " murmurHash2_32 ";
    using ReturnType = UInt32;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static UInt32 apply(const char * data, const size_t size)
    {
        return MurmurHash2(data, size, 0);
    }

    /// Combines two hashes: the integer hash of the first is XOR-ed with the second.
    static UInt32 combineHashes(UInt32 h1, UInt32 h2)
    {
        const UInt32 mixed = IntHash32Impl::apply(h1);
        return mixed ^ h2;
    }
};
/// 64-bit MurmurHash2 (MurmurHash64A) with seed 0.
struct MurmurHash2Impl64
{
    static constexpr auto name = " murmurHash2_64 ";
    using ReturnType = UInt64;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static UInt64 apply(const char * data, const size_t size)
    {
        return MurmurHash64A(data, size, 0);
    }

    /// Combines two hashes: the integer hash of the first is XOR-ed with the second.
    static UInt64 combineHashes(UInt64 h1, UInt64 h2)
    {
        const UInt64 mixed = IntHash64Impl::apply(h1);
        return mixed ^ h2;
    }
};
2019-01-08 05:12:52 +00:00
/// To be compatible with gcc: https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191
2019-01-09 02:03:50 +00:00
struct GccMurmurHashImpl
2019-01-08 05:12:52 +00:00
{
2019-01-09 02:03:50 +00:00
static constexpr auto name = " gccMurmurHash " ;
2019-01-08 05:12:52 +00:00
using ReturnType = UInt64 ;
static UInt64 apply ( const char * data , const size_t size )
{
return MurmurHash64A ( data , size , 0xc70f6907UL ) ;
}
static UInt64 combineHashes ( UInt64 h1 , UInt64 h2 )
{
return IntHash64Impl : : apply ( h1 ) ^ h2 ;
}
static constexpr bool use_int_hash_for_pods = false ;
} ;
2018-11-01 15:47:08 +00:00
/// 32-bit MurmurHash3 (x86_32 variant) with seed 0.
struct MurmurHash3Impl32
{
    static constexpr auto name = " murmurHash3_32 ";
    using ReturnType = UInt32;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static UInt32 apply(const char * data, const size_t size)
    {
        /// The library writes raw bytes; the overlay lets them be read back as an integer.
        union
        {
            UInt32 value;
            char bytes[sizeof(UInt32)];
        } out;

        MurmurHash3_x86_32(data, size, 0, out.bytes);
        return out.value;
    }

    /// Combines two hashes: the integer hash of the first is XOR-ed with the second.
    static UInt32 combineHashes(UInt32 h1, UInt32 h2)
    {
        const UInt32 mixed = IntHash32Impl::apply(h1);
        return mixed ^ h2;
    }
};
/// 64-bit hash derived from MurmurHash3 x64_128 (seed 0) by XOR-ing the two 64-bit halves.
struct MurmurHash3Impl64
{
    static constexpr auto name = " murmurHash3_64 ";
    using ReturnType = UInt64;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static UInt64 apply(const char * data, const size_t size)
    {
        /// The library writes 16 raw bytes; the overlay lets them be read back as two integers.
        union
        {
            UInt64 halves[2];
            char bytes[16];
        } out;

        MurmurHash3_x64_128(data, size, 0, out.bytes);
        return out.halves[0] ^ out.halves[1];
    }

    /// Combines two hashes: the integer hash of the first is XOR-ed with the second.
    static UInt64 combineHashes(UInt64 h1, UInt64 h2)
    {
        const UInt64 mixed = IntHash64Impl::apply(h1);
        return mixed ^ h2;
    }
};
2020-04-16 12:31:57 +00:00
/// 128-bit MurmurHash3 (x64_128 variant, seed 0) written to a caller-provided buffer.
struct MurmurHash3Impl128
{
    static constexpr auto name = " murmurHash3_128 ";

    /// Result size in bytes.
    enum { length = 16 };

    /// Hashes `size` bytes at `begin` and writes the result to `out_char_data`.
    static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
    {
        MurmurHash3_x64_128(begin, size, 0, out_char_data);
    }
};
# endif
2018-12-18 20:24:16 +00:00
/// http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452
/// Care should be taken to do all calculation in unsigned integers (to avoid undefined behaviour on overflow)
/// but obtain the same result as it is done in signed integers with two's complement arithmetic.
/// Polynomial hash h = 31 * h + byte over the input, each byte taken as a signed 8-bit value.
struct JavaHashImpl
{
    static constexpr auto name = " javaHash ";
    using ReturnType = Int32;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static Int32 apply(const char * data, const size_t size)
    {
        /// Accumulate in an unsigned integer so that overflow wraps instead of being undefined behaviour.
        UInt32 accumulator = 0;
        for (const char * pos = data, * end = data + size; pos != end; ++pos)
        {
            /// Each byte is sign-extended (through Int8) before it is added.
            const auto signed_byte = static_cast<Int8>(*pos);
            accumulator = 31 * accumulator + static_cast<UInt32>(signed_byte);
        }
        return static_cast<Int32>(accumulator);
    }

    /// Combining is not supported: always throws NOT_IMPLEMENTED.
    static Int32 combineHashes(Int32, Int32)
    {
        throw Exception(" Java hash is not combineable for multiple arguments ", ErrorCodes::NOT_IMPLEMENTED);
    }
};
2019-11-06 10:46:37 +00:00
/// Polynomial hash h = 31 * h + unit over the input interpreted as little-endian 16-bit code units.
struct JavaHashUTF16LEImpl
{
    static constexpr auto name = " javaHashUTF16LE ";
    using ReturnType = Int32;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /** Hashes `raw_size` bytes starting at `raw_data`.
      * A leading byte-order mark (bytes 0xFF 0xFE) is skipped before hashing.
      * Throws BAD_ARGUMENTS if the number of remaining bytes is odd.
      */
    static Int32 apply(const char * raw_data, const size_t raw_size)
    {
        /// The input is only read, so constness is kept (no const_cast).
        const char * data = raw_data;
        size_t size = raw_size;

        /// Remove Byte-order-mark(0xFFFE) for UTF-16LE.
        /// The comparison needs single-character literals: a multi-character literal is an int
        /// with an implementation-defined value and can never equal a promoted char.
        if (size >= 2 && data[0] == '\xFF' && data[1] == '\xFE')
        {
            data += 2;
            size -= 2;
        }

        if (size % 2 != 0)
            throw Exception(" Arguments for javaHashUTF16LE must be in the form of UTF-16 ", ErrorCodes::BAD_ARGUMENTS);

        /// Accumulate in an unsigned integer so that overflow wraps instead of being undefined behaviour.
        UInt32 h = 0;
        for (size_t i = 0; i < size; i += 2)
        {
            /// Assemble one code unit: low byte first, high byte second.
            const auto code_unit = static_cast<UInt16>(static_cast<UInt8>(data[i]) | static_cast<UInt8>(data[i + 1]) << 8);
            h = 31 * h + code_unit;
        }

        return static_cast<Int32>(h);
    }

    /// Combining is not supported: always throws NOT_IMPLEMENTED.
    static Int32 combineHashes(Int32, Int32)
    {
        throw Exception(" Java hash is not combineable for multiple arguments ", ErrorCodes::NOT_IMPLEMENTED);
    }
};
2018-12-18 20:24:16 +00:00
/// This is just JavaHash with zeroed out sign bit.
/// This function is used in Hive for versions before 3.0,
/// after 3.0, Hive uses murmur-hash3.
/// JavaHashImpl result with the sign bit cleared.
struct HiveHashImpl
{
    static constexpr auto name = " hiveHash ";
    using ReturnType = Int32;

    /// Numbers are hashed as raw bytes with this function rather than with intHash32/intHash64.
    static constexpr bool use_int_hash_for_pods = false;

    /// Hashes `size` bytes starting at `data`.
    static Int32 apply(const char * data, const size_t size)
    {
        const auto java_hash = static_cast<UInt32>(JavaHashImpl::apply(data, size));
        /// Keep the low 31 bits only, so the result is never negative.
        return static_cast<Int32>(0x7FFFFFFF & java_hash);
    }

    /// Combining is not supported: always throws NOT_IMPLEMENTED.
    static Int32 combineHashes(Int32, Int32)
    {
        throw Exception(" Hive hash is not combineable for multiple arguments ", ErrorCodes::NOT_IMPLEMENTED);
    }
};
2018-11-01 15:47:08 +00:00
struct ImplCityHash64
{
static constexpr auto name = " cityHash64 " ;
using ReturnType = UInt64 ;
using uint128_t = CityHash_v1_0_2 : : uint128 ;
static auto combineHashes ( UInt64 h1 , UInt64 h2 ) { return CityHash_v1_0_2 : : Hash128to64 ( uint128_t ( h1 , h2 ) ) ; }
static auto apply ( const char * s , const size_t len ) { return CityHash_v1_0_2 : : CityHash64 ( s , len ) ; }
static constexpr bool use_int_hash_for_pods = true ;
} ;
// see farmhash.h for definition of NAMESPACE_FOR_HASH_FUNCTIONS
struct ImplFarmHash64
{
static constexpr auto name = " farmHash64 " ;
using ReturnType = UInt64 ;
using uint128_t = NAMESPACE_FOR_HASH_FUNCTIONS : : uint128_t ;
static auto combineHashes ( UInt64 h1 , UInt64 h2 ) { return NAMESPACE_FOR_HASH_FUNCTIONS : : Hash128to64 ( uint128_t ( h1 , h2 ) ) ; }
static auto apply ( const char * s , const size_t len ) { return NAMESPACE_FOR_HASH_FUNCTIONS : : Hash64 ( s , len ) ; }
static constexpr bool use_int_hash_for_pods = true ;
} ;
struct ImplMetroHash64
{
static constexpr auto name = " metroHash64 " ;
using ReturnType = UInt64 ;
using uint128_t = CityHash_v1_0_2 : : uint128 ;
static auto combineHashes ( UInt64 h1 , UInt64 h2 ) { return CityHash_v1_0_2 : : Hash128to64 ( uint128_t ( h1 , h2 ) ) ; }
static auto apply ( const char * s , const size_t len )
{
union
{
UInt64 u64 ;
2020-02-22 05:46:35 +00:00
uint8_t u8 [ sizeof ( u64 ) ] ;
2018-11-01 15:47:08 +00:00
} ;
2020-02-22 05:46:35 +00:00
metrohash64_1 ( reinterpret_cast < const uint8_t * > ( s ) , len , 0 , u8 ) ;
2018-11-01 15:47:08 +00:00
return u64 ;
}
static constexpr bool use_int_hash_for_pods = true ;
} ;
2018-12-23 19:25:40 +00:00
# if USE_XXHASH
2018-12-21 17:53:16 +00:00
struct ImplXxHash32
{
static constexpr auto name = " xxHash32 " ;
using ReturnType = UInt32 ;
static auto apply ( const char * s , const size_t len ) { return XXH32 ( s , len , 0 ) ; }
/**
* With current implementation with more than 1 arguments it will give the results
* non - reproducable from outside of CH .
*
* Proper way of combining several input is to use streaming mode of hash function
* https : //github.com/Cyan4973/xxHash/issues/114#issuecomment-334908566
*
* In common case doable by init_state / update_state / finalize_state
*/
static auto combineHashes ( UInt32 h1 , UInt32 h2 ) { return IntHash32Impl : : apply ( h1 ) ^ h2 ; }
static constexpr bool use_int_hash_for_pods = false ;
} ;
struct ImplXxHash64
{
static constexpr auto name = " xxHash64 " ;
using ReturnType = UInt64 ;
using uint128_t = CityHash_v1_0_2 : : uint128 ;
static auto apply ( const char * s , const size_t len ) { return XXH64 ( s , len , 0 ) ; }
/*
With current implementation with more than 1 arguments it will give the results
non - reproducable from outside of CH . ( see comment on ImplXxHash32 ) .
*/
static auto combineHashes ( UInt64 h1 , UInt64 h2 ) { return CityHash_v1_0_2 : : Hash128to64 ( uint128_t ( h1 , h2 ) ) ; }
static constexpr bool use_int_hash_for_pods = false ;
} ;
2018-12-23 19:25:40 +00:00
# endif
2018-11-01 15:47:08 +00:00
2014-10-29 12:25:33 +00:00
template < typename Impl >
class FunctionStringHashFixedString : public IFunction
{
public :
2017-04-01 07:20:54 +00:00
static constexpr auto name = Impl : : name ;
2018-06-03 20:39:06 +00:00
static FunctionPtr create ( const Context & ) { return std : : make_shared < FunctionStringHashFixedString > ( ) ; }
2017-04-01 07:20:54 +00:00
String getName ( ) const override
{
return name ;
}
size_t getNumberOfArguments ( ) const override { return 1 ; }
DataTypePtr getReturnTypeImpl ( const DataTypes & arguments ) const override
{
2020-06-23 05:35:35 +00:00
if ( ! isStringOrFixedString ( arguments [ 0 ] ) )
2017-04-01 07:20:54 +00:00
throw Exception ( " Illegal type " + arguments [ 0 ] - > getName ( ) + " of argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
return std : : make_shared < DataTypeFixedString > ( Impl : : length ) ;
}
2017-07-23 08:40:43 +00:00
bool useDefaultImplementationForConstants ( ) const override { return true ; }
2018-04-24 07:16:39 +00:00
void executeImpl ( Block & block , const ColumnNumbers & arguments , size_t result , size_t /*input_rows_count*/ ) override
2017-04-01 07:20:54 +00:00
{
2017-07-21 06:35:58 +00:00
if ( const ColumnString * col_from = checkAndGetColumn < ColumnString > ( block . getByPosition ( arguments [ 0 ] ) . column . get ( ) ) )
2017-04-01 07:20:54 +00:00
{
2017-12-14 01:43:19 +00:00
auto col_to = ColumnFixedString : : create ( Impl : : length ) ;
2017-04-01 07:20:54 +00:00
2018-11-25 00:08:50 +00:00
const typename ColumnString : : Chars & data = col_from - > getChars ( ) ;
2017-12-15 21:32:25 +00:00
const typename ColumnString : : Offsets & offsets = col_from - > getOffsets ( ) ;
2017-04-01 07:20:54 +00:00
auto & chars_to = col_to - > getChars ( ) ;
const auto size = offsets . size ( ) ;
chars_to . resize ( size * Impl : : length ) ;
2018-07-30 18:00:16 +00:00
ColumnString : : Offset current_offset = 0 ;
2017-04-01 07:20:54 +00:00
for ( size_t i = 0 ; i < size ; + + i )
2018-07-30 18:00:16 +00:00
{
2017-04-01 07:20:54 +00:00
Impl : : apply (
2018-07-30 18:00:16 +00:00
reinterpret_cast < const char * > ( & data [ current_offset ] ) ,
offsets [ i ] - current_offset - 1 ,
2020-02-22 05:46:35 +00:00
reinterpret_cast < uint8_t * > ( & chars_to [ i * Impl : : length ] ) ) ;
2017-12-16 05:21:04 +00:00
2018-07-30 18:00:16 +00:00
current_offset = offsets [ i ] ;
}
2017-12-16 05:21:04 +00:00
block . getByPosition ( result ) . column = std : : move ( col_to ) ;
2017-04-01 07:20:54 +00:00
}
2020-06-23 05:35:35 +00:00
else if (
const ColumnFixedString * col_from_fix = checkAndGetColumn < ColumnFixedString > ( block . getByPosition ( arguments [ 0 ] ) . column . get ( ) ) )
{
auto col_to = ColumnFixedString : : create ( Impl : : length ) ;
const typename ColumnFixedString : : Chars & data = col_from_fix - > getChars ( ) ;
const auto size = col_from_fix - > size ( ) ;
auto & chars_to = col_to - > getChars ( ) ;
const auto length = col_from_fix - > getN ( ) ;
chars_to . resize ( size * Impl : : length ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
Impl : : apply (
reinterpret_cast < const char * > ( & data [ i * length ] ) , length , reinterpret_cast < uint8_t * > ( & chars_to [ i * Impl : : length ] ) ) ;
}
block . getByPosition ( result ) . column = std : : move ( col_to ) ;
}
2017-04-01 07:20:54 +00:00
else
2017-07-21 06:35:58 +00:00
throw Exception ( " Illegal column " + block . getByPosition ( arguments [ 0 ] ) . column - > getName ( )
2017-04-01 07:20:54 +00:00
+ " of first argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_COLUMN ) ;
}
2014-10-29 12:25:33 +00:00
} ;
2020-05-25 15:16:19 +00:00
DECLARE_MULTITARGET_CODE (
2012-07-15 21:43:04 +00:00
template < typename Impl , typename Name >
class FunctionIntHash : public IFunction
{
2014-11-12 17:23:26 +00:00
public :
2017-04-01 07:20:54 +00:00
static constexpr auto name = Name : : name ;
2014-11-12 17:23:26 +00:00
2012-07-15 21:43:04 +00:00
private :
2017-04-01 07:20:54 +00:00
using ToType = typename Impl : : ReturnType ;
template < typename FromType >
void executeType ( Block & block , const ColumnNumbers & arguments , size_t result )
{
2017-07-21 06:35:58 +00:00
if ( auto col_from = checkAndGetColumn < ColumnVector < FromType > > ( block . getByPosition ( arguments [ 0 ] ) . column . get ( ) ) )
2017-04-01 07:20:54 +00:00
{
2017-12-14 01:43:19 +00:00
auto col_to = ColumnVector < ToType > : : create ( ) ;
2017-04-01 07:20:54 +00:00
2017-12-15 21:32:25 +00:00
const typename ColumnVector < FromType > : : Container & vec_from = col_from - > getData ( ) ;
typename ColumnVector < ToType > : : Container & vec_to = col_to - > getData ( ) ;
2017-04-01 07:20:54 +00:00
size_t size = vec_from . size ( ) ;
vec_to . resize ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
vec_to [ i ] = Impl : : apply ( vec_from [ i ] ) ;
2017-12-16 05:21:04 +00:00
block . getByPosition ( result ) . column = std : : move ( col_to ) ;
2017-04-01 07:20:54 +00:00
}
else
2017-07-21 06:35:58 +00:00
throw Exception ( " Illegal column " + block . getByPosition ( arguments [ 0 ] ) . column - > getName ( )
2017-04-01 07:20:54 +00:00
+ " of first argument of function " + Name : : name ,
ErrorCodes : : ILLEGAL_COLUMN ) ;
}
2012-07-15 21:43:04 +00:00
public :
2017-04-01 07:20:54 +00:00
String getName ( ) const override
{
return name ;
}
size_t getNumberOfArguments ( ) const override { return 1 ; }
DataTypePtr getReturnTypeImpl ( const DataTypes & arguments ) const override
{
2018-11-01 15:47:08 +00:00
if ( ! arguments [ 0 ] - > isValueRepresentedByNumber ( ) )
2017-04-01 07:20:54 +00:00
throw Exception ( " Illegal type " + arguments [ 0 ] - > getName ( ) + " of argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
return std : : make_shared < DataTypeNumber < typename Impl : : ReturnType > > ( ) ;
}
2017-07-23 08:40:43 +00:00
bool useDefaultImplementationForConstants ( ) const override { return true ; }
2018-04-24 07:16:39 +00:00
void executeImpl ( Block & block , const ColumnNumbers & arguments , size_t result , size_t /*input_rows_count*/ ) override
2017-04-01 07:20:54 +00:00
{
2017-12-18 01:11:48 +00:00
const IDataType * from_type = block . getByPosition ( arguments [ 0 ] ) . type . get ( ) ;
2018-09-07 14:37:26 +00:00
WhichDataType which ( from_type ) ;
if ( which . isUInt8 ( ) ) executeType < UInt8 > ( block , arguments , result ) ;
else if ( which . isUInt16 ( ) ) executeType < UInt16 > ( block , arguments , result ) ;
else if ( which . isUInt32 ( ) ) executeType < UInt32 > ( block , arguments , result ) ;
else if ( which . isUInt64 ( ) ) executeType < UInt64 > ( block , arguments , result ) ;
else if ( which . isInt8 ( ) ) executeType < Int8 > ( block , arguments , result ) ;
else if ( which . isInt16 ( ) ) executeType < Int16 > ( block , arguments , result ) ;
else if ( which . isInt32 ( ) ) executeType < Int32 > ( block , arguments , result ) ;
else if ( which . isInt64 ( ) ) executeType < Int64 > ( block , arguments , result ) ;
else if ( which . isDate ( ) ) executeType < UInt16 > ( block , arguments , result ) ;
else if ( which . isDateTime ( ) ) executeType < UInt32 > ( block , arguments , result ) ;
2017-04-01 07:20:54 +00:00
else
2017-07-21 06:35:58 +00:00
throw Exception ( " Illegal type " + block . getByPosition ( arguments [ 0 ] ) . type - > getName ( ) + " of argument of function " + getName ( ) ,
2017-04-01 07:20:54 +00:00
ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
}
2012-07-15 21:43:04 +00:00
} ;
2020-05-25 15:16:19 +00:00
) // DECLARE_MULTITARGET_CODE
template < typename Impl , typename Name >
class FunctionIntHash : public TargetSpecific : : Default : : FunctionIntHash < Impl , Name >
{
public :
2020-05-28 15:38:07 +00:00
explicit FunctionIntHash ( const Context & context ) : selector ( context )
2020-05-25 15:16:19 +00:00
{
selector . registerImplementation < TargetArch : : Default ,
TargetSpecific : : Default : : FunctionIntHash < Impl , Name > > ( ) ;
2020-05-28 11:48:56 +00:00
2020-05-26 15:56:46 +00:00
# if USE_MULTITARGET_CODE
2020-05-25 15:16:19 +00:00
selector . registerImplementation < TargetArch : : AVX2 ,
TargetSpecific : : AVX2 : : FunctionIntHash < Impl , Name > > ( ) ;
selector . registerImplementation < TargetArch : : AVX512F ,
TargetSpecific : : AVX512F : : FunctionIntHash < Impl , Name > > ( ) ;
2020-05-26 15:56:46 +00:00
# endif
2020-05-25 15:16:19 +00:00
}
void executeImpl ( Block & block , const ColumnNumbers & arguments , size_t result , size_t input_rows_count ) override
{
2020-05-28 11:48:56 +00:00
selector . selectAndExecute ( block , arguments , result , input_rows_count ) ;
2020-05-25 15:16:19 +00:00
}
2020-05-28 11:48:56 +00:00
2020-05-25 15:16:19 +00:00
static FunctionPtr create ( const Context & context )
{
return std : : make_shared < FunctionIntHash > ( context ) ;
}
private :
ImplementationSelector < IFunction > selector ;
} ;
2012-07-15 21:43:04 +00:00
2020-05-26 15:56:46 +00:00
DECLARE_MULTITARGET_CODE (
2015-07-17 15:56:08 +00:00
template < typename Impl >
2018-10-23 17:43:09 +00:00
class FunctionAnyHash : public IFunction
2014-07-03 11:24:01 +00:00
{
2014-11-12 17:23:26 +00:00
public :
2017-04-01 07:20:54 +00:00
static constexpr auto name = Impl : : name ;
2014-11-12 17:23:26 +00:00
2014-07-03 11:24:01 +00:00
private :
2018-10-23 17:43:09 +00:00
using ToType = typename Impl : : ReturnType ;
2019-11-13 14:20:23 +00:00
template < typename FromType , bool first >
2018-10-23 17:43:09 +00:00
void executeIntType ( const IColumn * column , typename ColumnVector < ToType > : : Container & vec_to )
2017-04-01 07:20:54 +00:00
{
2019-11-13 14:20:23 +00:00
if ( const ColumnVector < FromType > * col_from = checkAndGetColumn < ColumnVector < FromType > > ( column ) )
2017-04-01 07:20:54 +00:00
{
2019-11-13 14:20:23 +00:00
const typename ColumnVector < FromType > : : Container & vec_from = col_from - > getData ( ) ;
2017-04-01 07:20:54 +00:00
size_t size = vec_from . size ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
2018-10-23 17:43:09 +00:00
ToType h ;
2018-11-01 15:47:08 +00:00
if constexpr ( Impl : : use_int_hash_for_pods )
{
if constexpr ( std : : is_same_v < ToType , UInt64 > )
h = IntHash64Impl : : apply ( ext : : bit_cast < UInt64 > ( vec_from [ i ] ) ) ;
else
h = IntHash32Impl : : apply ( ext : : bit_cast < UInt32 > ( vec_from [ i ] ) ) ;
}
2018-10-23 17:43:09 +00:00
else
2018-11-01 15:47:08 +00:00
{
h = Impl : : apply ( reinterpret_cast < const char * > ( & vec_from [ i ] ) , sizeof ( vec_from [ i ] ) ) ;
}
2017-04-01 07:20:54 +00:00
if ( first )
vec_to [ i ] = h ;
else
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , h ) ;
2017-04-01 07:20:54 +00:00
}
}
2019-11-13 14:20:23 +00:00
else if ( auto col_from_const = checkAndGetColumnConst < ColumnVector < FromType > > ( column ) )
2017-04-01 07:20:54 +00:00
{
2019-01-04 12:10:00 +00:00
auto value = col_from_const - > template getValue < FromType > ( ) ;
2018-10-23 17:43:09 +00:00
ToType hash ;
if constexpr ( std : : is_same_v < ToType , UInt64 > )
hash = IntHash64Impl : : apply ( ext : : bit_cast < UInt64 > ( value ) ) ;
else
hash = IntHash32Impl : : apply ( ext : : bit_cast < UInt32 > ( value ) ) ;
2017-04-01 07:20:54 +00:00
size_t size = vec_to . size ( ) ;
if ( first )
{
vec_to . assign ( size , hash ) ;
}
else
{
for ( size_t i = 0 ; i < size ; + + i )
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , hash ) ;
2017-04-01 07:20:54 +00:00
}
}
else
throw Exception ( " Illegal column " + column - > getName ( )
+ " of argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_COLUMN ) ;
}
2019-11-12 16:37:36 +00:00
template < bool first >
2019-12-06 15:21:58 +00:00
void executeGeneric ( const IColumn * column , typename ColumnVector < ToType > : : Container & vec_to )
{
2019-12-06 19:21:22 +00:00
for ( size_t i = 0 , size = column - > size ( ) ; i < size ; + + i )
2019-12-06 15:21:58 +00:00
{
2019-12-06 16:27:45 +00:00
StringRef bytes = column - > getDataAt ( i ) ;
const ToType h = Impl : : apply ( bytes . data , bytes . size ) ;
2019-12-06 15:21:58 +00:00
if ( first )
2019-12-06 16:27:45 +00:00
vec_to [ i ] = h ;
2019-12-06 15:21:58 +00:00
else
2019-12-06 16:27:45 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , h ) ;
2019-12-06 15:21:58 +00:00
}
}
2017-04-01 07:20:54 +00:00
template < bool first >
2018-10-23 17:43:09 +00:00
void executeString ( const IColumn * column , typename ColumnVector < ToType > : : Container & vec_to )
2017-04-01 07:20:54 +00:00
{
2017-07-21 06:35:58 +00:00
if ( const ColumnString * col_from = checkAndGetColumn < ColumnString > ( column ) )
2017-04-01 07:20:54 +00:00
{
2018-11-25 00:08:50 +00:00
const typename ColumnString : : Chars & data = col_from - > getChars ( ) ;
2017-12-15 21:32:25 +00:00
const typename ColumnString : : Offsets & offsets = col_from - > getOffsets ( ) ;
2017-04-01 07:20:54 +00:00
size_t size = offsets . size ( ) ;
2018-07-30 18:00:16 +00:00
ColumnString : : Offset current_offset = 0 ;
2017-04-01 07:20:54 +00:00
for ( size_t i = 0 ; i < size ; + + i )
{
2018-10-23 17:43:09 +00:00
const ToType h = Impl : : apply (
2018-07-30 18:00:16 +00:00
reinterpret_cast < const char * > ( & data [ current_offset ] ) ,
offsets [ i ] - current_offset - 1 ) ;
2017-04-01 07:20:54 +00:00
if ( first )
vec_to [ i ] = h ;
else
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , h ) ;
2018-07-30 18:00:16 +00:00
current_offset = offsets [ i ] ;
2017-04-01 07:20:54 +00:00
}
}
2019-01-04 12:10:00 +00:00
else if ( const ColumnFixedString * col_from_fixed = checkAndGetColumn < ColumnFixedString > ( column ) )
2017-04-01 07:20:54 +00:00
{
2019-01-04 12:10:00 +00:00
const typename ColumnString : : Chars & data = col_from_fixed - > getChars ( ) ;
size_t n = col_from_fixed - > getN ( ) ;
2017-04-01 07:20:54 +00:00
size_t size = data . size ( ) / n ;
2018-07-30 18:00:16 +00:00
2017-04-01 07:20:54 +00:00
for ( size_t i = 0 ; i < size ; + + i )
{
2018-10-23 17:43:09 +00:00
const ToType h = Impl : : apply ( reinterpret_cast < const char * > ( & data [ i * n ] ) , n ) ;
2017-04-01 07:20:54 +00:00
if ( first )
vec_to [ i ] = h ;
else
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , h ) ;
2017-04-01 07:20:54 +00:00
}
}
2019-01-04 12:10:00 +00:00
else if ( const ColumnConst * col_from_const = checkAndGetColumnConstStringOrFixedString ( column ) )
2017-04-01 07:20:54 +00:00
{
2019-01-04 12:10:00 +00:00
String value = col_from_const - > getValue < String > ( ) . data ( ) ;
2018-10-23 17:43:09 +00:00
const ToType hash = Impl : : apply ( value . data ( ) , value . size ( ) ) ;
2017-04-01 07:20:54 +00:00
const size_t size = vec_to . size ( ) ;
2018-07-30 18:00:16 +00:00
2017-04-01 07:20:54 +00:00
if ( first )
{
vec_to . assign ( size , hash ) ;
}
else
{
for ( size_t i = 0 ; i < size ; + + i )
{
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , hash ) ;
2017-04-01 07:20:54 +00:00
}
}
}
else
throw Exception ( " Illegal column " + column - > getName ( )
+ " of first argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_COLUMN ) ;
}
template < bool first >
2018-10-23 17:43:09 +00:00
void executeArray ( const IDataType * type , const IColumn * column , typename ColumnVector < ToType > : : Container & vec_to )
2017-04-01 07:20:54 +00:00
{
2017-07-21 06:35:58 +00:00
const IDataType * nested_type = typeid_cast < const DataTypeArray * > ( type ) - > getNestedType ( ) . get ( ) ;
2017-04-01 07:20:54 +00:00
2017-07-21 06:35:58 +00:00
if ( const ColumnArray * col_from = checkAndGetColumn < ColumnArray > ( column ) )
2017-04-01 07:20:54 +00:00
{
const IColumn * nested_column = & col_from - > getData ( ) ;
2017-12-15 21:32:25 +00:00
const ColumnArray : : Offsets & offsets = col_from - > getOffsets ( ) ;
2017-04-01 07:20:54 +00:00
const size_t nested_size = nested_column - > size ( ) ;
2018-10-23 17:43:09 +00:00
typename ColumnVector < ToType > : : Container vec_temp ( nested_size ) ;
2017-04-01 07:20:54 +00:00
executeAny < true > ( nested_type , nested_column , vec_temp ) ;
const size_t size = offsets . size ( ) ;
2018-07-30 18:00:16 +00:00
ColumnArray : : Offset current_offset = 0 ;
2017-04-01 07:20:54 +00:00
for ( size_t i = 0 ; i < size ; + + i )
{
2018-07-30 18:00:16 +00:00
ColumnArray : : Offset next_offset = offsets [ i ] ;
2017-04-01 07:20:54 +00:00
2018-10-23 17:43:09 +00:00
ToType h ;
if constexpr ( std : : is_same_v < ToType , UInt64 > )
h = IntHash64Impl : : apply ( next_offset - current_offset ) ;
else
h = IntHash32Impl : : apply ( next_offset - current_offset ) ;
2017-04-01 07:20:54 +00:00
if ( first )
vec_to [ i ] = h ;
else
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , h ) ;
2017-04-01 07:20:54 +00:00
2018-07-30 18:00:16 +00:00
for ( size_t j = current_offset ; j < next_offset ; + + j )
2018-10-24 13:12:59 +00:00
vec_to [ i ] = Impl : : combineHashes ( vec_to [ i ] , vec_temp [ j ] ) ;
2018-07-30 18:00:16 +00:00
current_offset = offsets [ i ] ;
2017-04-01 07:20:54 +00:00
}
}
2019-01-04 12:10:00 +00:00
else if ( const ColumnConst * col_from_const = checkAndGetColumnConst < ColumnArray > ( column ) )
2017-04-01 07:20:54 +00:00
{
2017-05-27 15:45:25 +00:00
/// NOTE: here, of course, you can do without the materialization of the column.
2019-01-04 12:10:00 +00:00
ColumnPtr full_column = col_from_const - > convertToFullColumn ( ) ;
2017-04-01 07:20:54 +00:00
executeArray < first > ( type , & * full_column , vec_to ) ;
}
else
throw Exception ( " Illegal column " + column - > getName ( )
+ " of first argument of function " + getName ( ) ,
ErrorCodes : : ILLEGAL_COLUMN ) ;
}
template < bool first >
2018-10-23 17:43:09 +00:00
void executeAny ( const IDataType * from_type , const IColumn * icolumn , typename ColumnVector < ToType > : : Container & vec_to )
2017-04-01 07:20:54 +00:00
{
2018-09-07 14:37:26 +00:00
WhichDataType which ( from_type ) ;
2019-11-13 14:20:23 +00:00
if ( which . isUInt8 ( ) ) executeIntType < UInt8 , first > ( icolumn , vec_to ) ;
else if ( which . isUInt16 ( ) ) executeIntType < UInt16 , first > ( icolumn , vec_to ) ;
else if ( which . isUInt32 ( ) ) executeIntType < UInt32 , first > ( icolumn , vec_to ) ;
else if ( which . isUInt64 ( ) ) executeIntType < UInt64 , first > ( icolumn , vec_to ) ;
else if ( which . isInt8 ( ) ) executeIntType < Int8 , first > ( icolumn , vec_to ) ;
else if ( which . isInt16 ( ) ) executeIntType < Int16 , first > ( icolumn , vec_to ) ;
else if ( which . isInt32 ( ) ) executeIntType < Int32 , first > ( icolumn , vec_to ) ;
else if ( which . isInt64 ( ) ) executeIntType < Int64 , first > ( icolumn , vec_to ) ;
else if ( which . isEnum8 ( ) ) executeIntType < Int8 , first > ( icolumn , vec_to ) ;
else if ( which . isEnum16 ( ) ) executeIntType < Int16 , first > ( icolumn , vec_to ) ;
else if ( which . isDate ( ) ) executeIntType < UInt16 , first > ( icolumn , vec_to ) ;
else if ( which . isDateTime ( ) ) executeIntType < UInt32 , first > ( icolumn , vec_to ) ;
else if ( which . isFloat32 ( ) ) executeIntType < Float32 , first > ( icolumn , vec_to ) ;
else if ( which . isFloat64 ( ) ) executeIntType < Float64 , first > ( icolumn , vec_to ) ;
2018-09-07 14:37:26 +00:00
else if ( which . isString ( ) ) executeString < first > ( icolumn , vec_to ) ;
else if ( which . isFixedString ( ) ) executeString < first > ( icolumn , vec_to ) ;
else if ( which . isArray ( ) ) executeArray < first > ( from_type , icolumn , vec_to ) ;
2017-04-01 07:20:54 +00:00
else
2019-12-06 19:18:47 +00:00
executeGeneric < first > ( icolumn , vec_to ) ;
2017-04-01 07:20:54 +00:00
}
2019-11-13 15:18:24 +00:00
2018-10-23 17:43:09 +00:00
void executeForArgument ( const IDataType * type , const IColumn * column , typename ColumnVector < ToType > : : Container & vec_to , bool & is_first )
2017-04-01 07:20:54 +00:00
{
/// Flattening of tuples.
if ( const ColumnTuple * tuple = typeid_cast < const ColumnTuple * > ( column ) )
{
2019-03-25 01:43:54 +00:00
const auto & tuple_columns = tuple - > getColumns ( ) ;
2017-12-08 00:50:25 +00:00
const DataTypes & tuple_types = typeid_cast < const DataTypeTuple & > ( * type ) . getElements ( ) ;
size_t tuple_size = tuple_columns . size ( ) ;
for ( size_t i = 0 ; i < tuple_size ; + + i )
executeForArgument ( tuple_types [ i ] . get ( ) , tuple_columns [ i ] . get ( ) , vec_to , is_first ) ;
2017-04-01 07:20:54 +00:00
}
2019-01-04 12:10:00 +00:00
else if ( const ColumnTuple * tuple_const = checkAndGetColumnConstData < ColumnTuple > ( column ) )
2017-04-01 07:20:54 +00:00
{
2019-03-25 01:43:54 +00:00
const auto & tuple_columns = tuple_const - > getColumns ( ) ;
2017-12-09 12:23:09 +00:00
const DataTypes & tuple_types = typeid_cast < const DataTypeTuple & > ( * type ) . getElements ( ) ;
size_t tuple_size = tuple_columns . size ( ) ;
for ( size_t i = 0 ; i < tuple_size ; + + i )
{
2017-12-16 05:46:46 +00:00
auto tmp = ColumnConst : : create ( tuple_columns [ i ] , column - > size ( ) ) ;
executeForArgument ( tuple_types [ i ] . get ( ) , tmp . get ( ) , vec_to , is_first ) ;
2017-12-09 12:23:09 +00:00
}
2017-04-01 07:20:54 +00:00
}
else
{
if ( is_first )
executeAny < true > ( type , column , vec_to ) ;
else
executeAny < false > ( type , column , vec_to ) ;
}
is_first = false ;
}
2015-10-29 00:12:04 +00:00
2014-07-03 11:24:01 +00:00
public :
2017-04-01 07:20:54 +00:00
/// Returns `name`.
String getName() const override { return name; }
/// Always true: the function supports an arbitrary number of arguments.
bool isVariadic() const override
{
    return true;
}
/// Always 0.
/// NOTE(review): presumably a placeholder that is not consulted because isVariadic() returns true - confirm against IFunction.
size_t getNumberOfArguments() const override
{
    return 0;
}
2017-12-17 10:51:19 +00:00
/// Always true.
/// NOTE(review): presumably opts in to the framework's default handling of all-constant arguments - confirm against IFunction.
bool useDefaultImplementationForConstants() const override
{
    return true;
}
2017-04-01 07:20:54 +00:00
2017-12-02 02:47:12 +00:00
/// The result type is always the numeric data type for ToType; the argument types are not inspected.
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
    using ResultDataType = DataTypeNumber<ToType>;
    return std::make_shared<ResultDataType>();
}
2018-04-24 07:16:39 +00:00
/// Computes one hash per row over all the arguments and stores the resulting column into block[result].
/// The arguments are folded in from left to right by executeForArgument.
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
    auto col_to = ColumnVector<ToType>::create(input_rows_count);
    typename ColumnVector<ToType>::Container & hashes = col_to->getData();

    /// Constant random number from /dev/urandom is used as a hash value of empty list of arguments.
    if (arguments.empty())
        hashes.assign(input_rows_count, static_cast<ToType>(0xe28dbde7fe22e41c));

    /// The function supports arbitrary number of arguments of arbitrary types.
    bool is_first_argument = true;
    for (const auto position : arguments)
    {
        const ColumnWithTypeAndName & argument = block.getByPosition(position);
        executeForArgument(argument.type.get(), argument.column.get(), hashes, is_first_argument);
    }

    block.getByPosition(result).column = std::move(col_to);
}
2014-07-03 11:24:01 +00:00
} ;
2020-05-26 15:56:46 +00:00
) // DECLARE_MULTITARGET_CODE
template < typename Impl >
class FunctionAnyHash : public TargetSpecific : : Default : : FunctionAnyHash < Impl >
{
public :
2020-05-28 15:38:07 +00:00
explicit FunctionAnyHash ( const Context & context ) : selector ( context )
2020-05-26 15:56:46 +00:00
{
selector . registerImplementation < TargetArch : : Default ,
TargetSpecific : : Default : : FunctionAnyHash < Impl > > ( ) ;
# if USE_MULTITARGET_CODE
selector . registerImplementation < TargetArch : : AVX2 ,
TargetSpecific : : AVX2 : : FunctionAnyHash < Impl > > ( ) ;
selector . registerImplementation < TargetArch : : AVX512F ,
TargetSpecific : : AVX512F : : FunctionAnyHash < Impl > > ( ) ;
# endif
}
void executeImpl ( Block & block , const ColumnNumbers & arguments , size_t result , size_t input_rows_count ) override
{
selector . selectAndExecute ( block , arguments , result , input_rows_count ) ;
}
static FunctionPtr create ( const Context & context )
{
return std : : make_shared < FunctionAnyHash > ( context ) ;
}
private :
ImplementationSelector < IFunction > selector ;
} ;
2014-07-03 11:24:01 +00:00
2015-04-15 15:00:28 +00:00
/// Hash of a URL given as a byte range: CityHash64 (v1.0.2) of the bytes,
/// with one trailing delimiter character left out of the hashed range.
struct URLHashImpl
{
2017-07-21 06:35:58 +00:00
/// `data` points to the first byte of the URL and `size` is its length in bytes.
/// An empty range (size == 0) is hashed as is.
static UInt64 apply ( const char * data , const size_t size )
2017-04-01 07:20:54 +00:00
{
/// Do not take the last slash, '?' or '#' character into account:
/// if the URL ends with one of them, only the first size - 1 bytes are hashed.
if ( size > 0 & & ( data [ size - 1 ] = = ' / ' | | data [ size - 1 ] = = ' ? ' | | data [ size - 1 ] = = ' # ' ) )
2017-06-21 08:35:38 +00:00
return CityHash_v1_0_2 : : CityHash64 ( data , size - 1 ) ;
2017-04-01 07:20:54 +00:00
2017-06-21 08:35:38 +00:00
/// No trailing delimiter: hash the whole range.
return CityHash_v1_0_2 : : CityHash64 ( data , size ) ;
2017-04-01 07:20:54 +00:00
}
2015-04-15 15:00:28 +00:00
} ;
struct URLHierarchyHashImpl
{
2017-07-21 06:35:58 +00:00
static size_t findLevelLength ( const UInt64 level , const char * begin , const char * end )
2017-04-01 07:20:54 +00:00
{
auto pos = begin ;
2017-05-27 15:45:25 +00:00
/// Let's parse everything that goes before the path
2017-04-01 07:20:54 +00:00
2017-05-27 15:45:25 +00:00
/// Suppose that the protocol has already been changed to lowercase.
2017-04-01 07:20:54 +00:00
while ( pos < end & & ( ( * pos > ' a ' & & * pos < ' z ' ) | | ( * pos > ' 0 ' & & * pos < ' 9 ' ) ) )
+ + pos ;
2017-05-27 15:45:25 +00:00
/** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
* ( http , file - fit , mailto , magnet - do not fit ) , and after two slashes there is still something
* For the rest , simply return the full URL as the only element of the hierarchy .
2017-04-01 07:20:54 +00:00
*/
if ( pos = = begin | | pos = = end | | ! ( * pos + + = = ' : ' & & pos < end & & * pos + + = = ' / ' & & pos < end & & * pos + + = = ' / ' & & pos < end ) )
{
pos = end ;
return 0 = = level ? pos - begin : 0 ;
}
2017-05-27 15:45:25 +00:00
/// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or before `?` or `#`
2017-04-01 07:20:54 +00:00
while ( pos < end & & ! ( * pos = = ' / ' | | * pos = = ' ? ' | | * pos = = ' # ' ) )
+ + pos ;
if ( pos ! = end )
+ + pos ;
if ( 0 = = level )
return pos - begin ;
UInt64 current_level = 0 ;
while ( current_level ! = level & & pos < end )
{
2017-05-27 15:45:25 +00:00
/// We go to the next `/` or `?` or `#`, skipping all at the beginning.
2017-04-01 07:20:54 +00:00
while ( pos < end & & ( * pos = = ' / ' | | * pos = = ' ? ' | | * pos = = ' # ' ) )
+ + pos ;
if ( pos = = end )
break ;
while ( pos < end & & ! ( * pos = = ' / ' | | * pos = = ' ? ' | | * pos = = ' # ' ) )
+ + pos ;
if ( pos ! = end )
+ + pos ;
+ + current_level ;
}
return current_level = = level ? pos - begin : 0 ;
}
2017-07-21 06:35:58 +00:00
static UInt64 apply ( const UInt64 level , const char * data , const size_t size )
2017-04-01 07:20:54 +00:00
{
return URLHashImpl : : apply ( data , findLevelLength ( level , data , data + size ) ) ;
}
2015-04-15 15:00:28 +00:00
} ;
/// Function that hashes URLs stored in a String column into UInt64.
/// With one argument every row is hashed with URLHashImpl.
/// With two arguments the second one is a constant integer level and every row is hashed
/// with URLHierarchyHashImpl for that level.
class FunctionURLHash : public IFunction
{
public :
2017-04-01 07:20:54 +00:00
static constexpr auto name = " URLHash " ;
static FunctionPtr create ( const Context & ) { return std : : make_shared < FunctionURLHash > ( ) ; }
String getName ( ) const override { return name ; }
/// The argument count is validated in getReturnTypeImpl (1 or 2), so the function is declared variadic.
bool isVariadic ( ) const override { return true ; }
size_t getNumberOfArguments ( ) const override { return 0 ; }
/// Checks the arguments and returns UInt64.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH unless there are 1 or 2 arguments and
/// ILLEGAL_TYPE_OF_ARGUMENT when the first one is not a String or the second one is not an integer.
DataTypePtr getReturnTypeImpl ( const DataTypes & arguments ) const override
{
const auto arg_count = arguments . size ( ) ;
if ( arg_count ! = 1 & & arg_count ! = 2 )
2018-05-07 02:01:11 +00:00
throw Exception { " Number of arguments for function " + getName ( ) + " doesn't match: passed " +
toString ( arg_count ) + " , should be 1 or 2. " , ErrorCodes : : NUMBER_OF_ARGUMENTS_DOESNT_MATCH } ;
2017-04-01 07:20:54 +00:00
const auto first_arg = arguments . front ( ) . get ( ) ;
2018-09-07 14:37:26 +00:00
if ( ! WhichDataType ( first_arg ) . isString ( ) )
2018-05-07 02:01:11 +00:00
throw Exception { " Illegal type " + first_arg - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT } ;
2017-04-01 07:20:54 +00:00
if ( arg_count = = 2 )
{
2018-09-07 14:37:26 +00:00
const auto & second_arg = arguments . back ( ) ;
if ( ! isInteger ( second_arg ) )
2018-05-07 02:01:11 +00:00
throw Exception { " Illegal type " + second_arg - > getName ( ) + " of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT } ;
2017-04-01 07:20:54 +00:00
}
return std : : make_shared < DataTypeUInt64 > ( ) ;
}
2017-07-23 08:40:43 +00:00
bool useDefaultImplementationForConstants ( ) const override { return true ; }
/// NOTE(review): presumably a zero-based argument index, i.e. the level argument - confirm against IFunction.
ColumnNumbers getArgumentsThatAreAlwaysConstant ( ) const override { return { 1 } ; }
2018-04-24 07:16:39 +00:00
/// Dispatches on the number of arguments; any count other than 1 or 2 is reported as LOGICAL_ERROR.
void executeImpl ( Block & block , const ColumnNumbers & arguments , size_t result , size_t /*input_rows_count*/ ) override
2017-04-01 07:20:54 +00:00
{
const auto arg_count = arguments . size ( ) ;
if ( arg_count = = 1 )
executeSingleArg ( block , arguments , result ) ;
else if ( arg_count = = 2 )
executeTwoArgs ( block , arguments , result ) ;
else
2017-06-13 02:06:53 +00:00
throw Exception { " got into IFunction::execute with unexpected number of arguments " , ErrorCodes : : LOGICAL_ERROR } ;
2017-04-01 07:20:54 +00:00
}
2015-04-15 15:00:28 +00:00
private :
2017-07-21 06:35:58 +00:00
/// One-argument form: writes URLHashImpl::apply of every row of the String column into block[result].
/// Throws ILLEGAL_COLUMN when the argument is not a ColumnString.
void executeSingleArg ( Block & block , const ColumnNumbers & arguments , const size_t result ) const
2017-04-01 07:20:54 +00:00
{
2017-07-21 06:35:58 +00:00
const auto col_untyped = block . getByPosition ( arguments . front ( ) ) . column . get ( ) ;
2017-04-01 07:20:54 +00:00
2017-07-21 06:35:58 +00:00
if ( const auto col_from = checkAndGetColumn < ColumnString > ( col_untyped ) )
2017-04-01 07:20:54 +00:00
{
const auto size = col_from - > size ( ) ;
2017-12-16 05:46:46 +00:00
auto col_to = ColumnUInt64 : : create ( size ) ;
2017-04-01 07:20:54 +00:00
const auto & chars = col_from - > getChars ( ) ;
const auto & offsets = col_from - > getOffsets ( ) ;
auto & out = col_to - > getData ( ) ;
2018-07-30 18:00:16 +00:00
/// Row i occupies chars [current_offset, offsets[i]).
/// NOTE(review): the "- 1" presumably drops a terminating zero byte stored after each string - confirm against ColumnString.
ColumnString : : Offset current_offset = 0 ;
for ( size_t i = 0 ; i < size ; + + i )
{
2017-04-01 07:20:54 +00:00
out [ i ] = URLHashImpl : : apply (
2018-07-30 18:00:16 +00:00
reinterpret_cast < const char * > ( & chars [ current_offset ] ) ,
offsets [ i ] - current_offset - 1 ) ;
current_offset = offsets [ i ] ;
}
2017-12-16 05:21:04 +00:00
block . getByPosition ( result ) . column = std : : move ( col_to ) ;
2017-04-01 07:20:54 +00:00
}
else
2018-05-07 02:01:11 +00:00
throw Exception { " Illegal column " + block . getByPosition ( arguments [ 0 ] ) . column - > getName ( ) +
" of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_COLUMN } ;
2017-04-01 07:20:54 +00:00
}
2017-07-21 06:35:58 +00:00
/// Two-argument form: the level is read from row 0 of the constant second argument and
/// URLHierarchyHashImpl::apply(level, ...) of every row of the String column is written into block[result].
/// Throws ILLEGAL_COLUMN when the level is not a constant column or the first argument is not a ColumnString.
void executeTwoArgs ( Block & block , const ColumnNumbers & arguments , const size_t result ) const
2017-04-01 07:20:54 +00:00
{
2017-07-21 06:35:58 +00:00
const auto level_col = block . getByPosition ( arguments . back ( ) ) . column . get ( ) ;
2019-06-27 19:28:52 +00:00
if ( ! isColumnConst ( * level_col ) )
2018-05-07 02:01:11 +00:00
throw Exception { " Second argument of function " + getName ( ) + " must be an integral constant " , ErrorCodes : : ILLEGAL_COLUMN } ;
2017-04-01 07:20:54 +00:00
const auto level = level_col - > get64 ( 0 ) ;
2017-07-21 06:35:58 +00:00
const auto col_untyped = block . getByPosition ( arguments . front ( ) ) . column . get ( ) ;
if ( const auto col_from = checkAndGetColumn < ColumnString > ( col_untyped ) )
2017-04-01 07:20:54 +00:00
{
const auto size = col_from - > size ( ) ;
2017-12-16 05:46:46 +00:00
auto col_to = ColumnUInt64 : : create ( size ) ;
2017-04-01 07:20:54 +00:00
const auto & chars = col_from - > getChars ( ) ;
const auto & offsets = col_from - > getOffsets ( ) ;
auto & out = col_to - > getData ( ) ;
2018-07-30 18:00:16 +00:00
/// Same row layout as in executeSingleArg: row i occupies chars [current_offset, offsets[i]).
ColumnString : : Offset current_offset = 0 ;
for ( size_t i = 0 ; i < size ; + + i )
{
out [ i ] = URLHierarchyHashImpl : : apply (
level ,
reinterpret_cast < const char * > ( & chars [ current_offset ] ) ,
offsets [ i ] - current_offset - 1 ) ;
current_offset = offsets [ i ] ;
}
2017-12-16 05:21:04 +00:00
block . getByPosition ( result ) . column = std : : move ( col_to ) ;
2017-04-01 07:20:54 +00:00
}
else
2018-05-07 02:01:11 +00:00
throw Exception { " Illegal column " + block . getByPosition ( arguments [ 0 ] ) . column - > getName ( ) +
" of argument of function " + getName ( ) , ErrorCodes : : ILLEGAL_COLUMN } ;
2017-04-01 07:20:54 +00:00
}
2015-04-15 15:00:28 +00:00
} ;
2017-06-13 02:06:53 +00:00
/// Name tags: each carries a function name as a compile-time constant.
/// They are passed as the second template argument of FunctionIntHash below.
struct NameIntHash32 { static constexpr auto name = " intHash32 " ; } ;
struct NameIntHash64 { static constexpr auto name = " intHash64 " ; } ;
2018-08-02 09:21:26 +00:00
2019-03-22 11:18:24 +00:00
# if USE_SSL
2018-10-23 17:43:09 +00:00
using FunctionHalfMD5 = FunctionAnyHash < HalfMD5Impl > ;
2019-03-22 11:18:24 +00:00
# endif
2018-10-23 17:43:09 +00:00
using FunctionSipHash64 = FunctionAnyHash < SipHash64Impl > ;
2015-07-17 15:56:08 +00:00
using FunctionIntHash32 = FunctionIntHash < IntHash32Impl , NameIntHash32 > ;
using FunctionIntHash64 = FunctionIntHash < IntHash64Impl , NameIntHash64 > ;
2019-03-22 11:18:24 +00:00
# if USE_SSL
2015-07-17 15:56:08 +00:00
using FunctionMD5 = FunctionStringHashFixedString < MD5Impl > ;
using FunctionSHA1 = FunctionStringHashFixedString < SHA1Impl > ;
using FunctionSHA224 = FunctionStringHashFixedString < SHA224Impl > ;
using FunctionSHA256 = FunctionStringHashFixedString < SHA256Impl > ;
2019-03-22 11:18:24 +00:00
# endif
2015-07-17 15:56:08 +00:00
using FunctionSipHash128 = FunctionStringHashFixedString < SipHash128Impl > ;
2018-10-23 17:43:09 +00:00
using FunctionCityHash64 = FunctionAnyHash < ImplCityHash64 > ;
using FunctionFarmHash64 = FunctionAnyHash < ImplFarmHash64 > ;
using FunctionMetroHash64 = FunctionAnyHash < ImplMetroHash64 > ;
2020-04-16 12:31:57 +00:00
# if !defined(ARCADIA_BUILD)
2018-10-23 17:43:09 +00:00
using FunctionMurmurHash2_32 = FunctionAnyHash < MurmurHash2Impl32 > ;
using FunctionMurmurHash2_64 = FunctionAnyHash < MurmurHash2Impl64 > ;
2019-01-09 02:03:50 +00:00
using FunctionGccMurmurHash = FunctionAnyHash < GccMurmurHashImpl > ;
2018-10-23 17:43:09 +00:00
using FunctionMurmurHash3_32 = FunctionAnyHash < MurmurHash3Impl32 > ;
using FunctionMurmurHash3_64 = FunctionAnyHash < MurmurHash3Impl64 > ;
2018-08-02 09:21:26 +00:00
using FunctionMurmurHash3_128 = FunctionStringHashFixedString < MurmurHash3Impl128 > ;
2020-04-16 12:31:57 +00:00
# endif
2018-12-18 20:24:16 +00:00
using FunctionJavaHash = FunctionAnyHash < JavaHashImpl > ;
2019-11-06 10:46:37 +00:00
using FunctionJavaHashUTF16LE = FunctionAnyHash < JavaHashUTF16LEImpl > ;
2018-12-18 20:24:16 +00:00
using FunctionHiveHash = FunctionAnyHash < HiveHashImpl > ;
2018-12-21 17:53:16 +00:00
2018-12-23 19:25:40 +00:00
# if USE_XXHASH
using FunctionXxHash32 = FunctionAnyHash < ImplXxHash32 > ;
using FunctionXxHash64 = FunctionAnyHash < ImplXxHash64 > ;
# endif
2018-07-30 13:50:26 +00:00
}