diff --git a/dbms/src/Functions/FunctionsHashing.cpp b/dbms/src/Functions/FunctionsHashing.cpp
index 90b1057ce63..4a98a7993b4 100644
--- a/dbms/src/Functions/FunctionsHashing.cpp
+++ b/dbms/src/Functions/FunctionsHashing.cpp
@@ -23,6 +23,7 @@ void registerFunctionsHashing(FunctionFactory & factory)
     factory.registerFunction();
     factory.registerFunction();
     factory.registerFunction();
+    factory.registerFunction<FunctionJavaHashUTF16LE>();
     factory.registerFunction();
     factory.registerFunction();
     factory.registerFunction();
diff --git a/dbms/src/Functions/FunctionsHashing.h b/dbms/src/Functions/FunctionsHashing.h
index 1b98191e224..2290e3b2050 100644
--- a/dbms/src/Functions/FunctionsHashing.h
+++ b/dbms/src/Functions/FunctionsHashing.h
@@ -353,6 +353,61 @@ struct JavaHashImpl
     static constexpr bool use_int_hash_for_pods = false;
 };
 
+/// Hash of a UTF-16LE encoded string, folding its 16-bit code units with the
+/// 31-multiplier polynomial (h = 31 * h + unit, wrapping modulo 2^32).
+/// Judging by the name, this is meant to reproduce Java's String.hashCode().
+struct JavaHashUTF16LEImpl
+{
+    static constexpr auto name = "javaHashUTF16LE";
+    using ReturnType = Int32;
+
+    /// Hashes `raw_size` bytes starting at `raw_data`.
+    /// A leading byte-order mark (bytes 0xFF 0xFE) is skipped and does not take part in the hash.
+    /// Throws if what remains is not a whole number of 2-byte code units.
+    static Int32 apply(const char * raw_data, const size_t raw_size)
+    {
+        /// Inspect the bytes as unsigned. Whether plain `char` is signed is implementation-defined,
+        /// so comparing a `char` with -1 / -2 would never match the BOM where `char` is unsigned.
+        const unsigned char * data = reinterpret_cast<const unsigned char *>(raw_data);
+        size_t size = raw_size;
+
+        /// Skip the UTF-16LE byte-order mark (U+FEFF, stored as 0xFF 0xFE).
+        if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE)
+        {
+            data += 2;
+            size -= 2;
+        }
+
+        /// NOTE(review): malformed input is reported as LOGICAL_ERROR; an argument-related code may fit better.
+        if (size % 2 != 0)
+            throw Exception("Arguments for javaHashUTF16LE must be in the form of UTF-16", ErrorCodes::LOGICAL_ERROR);
+
+        UInt32 h = 0;
+        for (size_t i = 0; i < size; i += 2)
+        {
+            /// data[i] is the low byte and data[i + 1] the high byte of the little-endian code unit.
+            const UInt32 unit = (static_cast<UInt32>(data[i]) << HI_BYTE_SHIFT) | (static_cast<UInt32>(data[i + 1]) << LO_BYTE_SHIFT);
+            h = 31 * h + unit;
+        }
+
+        return static_cast<Int32>(h);
+    }
+
+    /// Hashing several arguments at once is not supported; always throws.
+    static Int32 combineHashes(Int32, Int32)
+    {
+        throw Exception("Java hash is not combineable for multiple arguments", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
+    static constexpr bool use_int_hash_for_pods = false;
+
+    /// Shift for the first byte of each pair (data[i]). Despite the name, with these values
+    /// that byte ends up in the low 8 bits of the code unit.
+    static constexpr int HI_BYTE_SHIFT = 0;
+    /// Shift for the second byte of each pair (data[i + 1]); it ends up in the high 8 bits.
+    static constexpr int LO_BYTE_SHIFT = 8;
+};
+
 /// This is just JavaHash with zeroed out sign bit.
 /// This function is used in Hive for versions before 3.0,
 /// after 3.0, Hive uses murmur-hash3.
@@ -1102,6 +1157,7 @@ using FunctionMurmurHash3_32 = FunctionAnyHash;
 using FunctionMurmurHash3_64 = FunctionAnyHash;
 using FunctionMurmurHash3_128 = FunctionStringHashFixedString;
 using FunctionJavaHash = FunctionAnyHash;
+using FunctionJavaHashUTF16LE = FunctionAnyHash<JavaHashUTF16LEImpl>;
 using FunctionHiveHash = FunctionAnyHash;
 
 #if USE_XXHASH
diff --git a/dbms/tests/queries/0_stateless/00800_function_java_hash.reference b/dbms/tests/queries/0_stateless/00800_function_java_hash.reference
index 7f9c68298bd..6efefd41459 100644
--- a/dbms/tests/queries/0_stateless/00800_function_java_hash.reference
+++ b/dbms/tests/queries/0_stateless/00800_function_java_hash.reference
@@ -1,4 +1,7 @@
 96354
 -676697544
+138768
+-2143570108
+2145564783
 96354
 1470786104
diff --git a/dbms/tests/queries/0_stateless/00800_function_java_hash.sql b/dbms/tests/queries/0_stateless/00800_function_java_hash.sql
index c69cd412f57..2010b8d8311 100644
--- a/dbms/tests/queries/0_stateless/00800_function_java_hash.sql
+++ b/dbms/tests/queries/0_stateless/00800_function_java_hash.sql
@@ -1,4 +1,7 @@
 select javaHash('abc');
 select javaHash('874293087');
+select javaHashUTF16LE(convertCharset('a1가', 'utf-8', 'utf-16le'));
+select javaHashUTF16LE(convertCharset('가나다라마바사아자차카타파하', 'utf-8', 'utf-16le'));
+select javaHashUTF16LE(convertCharset('FJKLDSJFIOLD_389159837589429', 'utf-8', 'utf-16le'));
 select hiveHash('abc');
 select hiveHash('874293087');