#include #include #include #include #include #include #include #ifdef __SSE4_2__ # include #endif namespace DB { namespace ErrorCodes { extern const int TOO_LARGE_STRING_SIZE; } template struct FunctionStringDistanceImpl { using ResultType = typename Op::ResultType; static void constantConstant(const std::string & haystack, const std::string & needle, ResultType & res) { res = Op::process(haystack.data(), haystack.size(), needle.data(), needle.size()); } static void vectorVector( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, PaddedPODArray & res) { size_t size = res.size(); const char * haystack = reinterpret_cast(haystack_data.data()); const char * needle = reinterpret_cast(needle_data.data()); for (size_t i = 0; i < size; ++i) { res[i] = Op::process( haystack + haystack_offsets[i - 1], haystack_offsets[i] - haystack_offsets[i - 1] - 1, needle + needle_offsets[i - 1], needle_offsets[i] - needle_offsets[i - 1] - 1); } } static void constantVector( const std::string & haystack, const ColumnString::Chars & needle_data, const ColumnString::Offsets & needle_offsets, PaddedPODArray & res) { const char * haystack_data = haystack.data(); size_t haystack_size = haystack.size(); const char * needle = reinterpret_cast(needle_data.data()); size_t size = res.size(); for (size_t i = 0; i < size; ++i) { res[i] = Op::process(haystack_data, haystack_size, needle + needle_offsets[i - 1], needle_offsets[i] - needle_offsets[i - 1] - 1); } } static void vectorConstant( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray & res) { constantVector(needle, data, offsets, res); } }; struct ByteHammingDistanceImpl { using ResultType = UInt64; static ResultType inline process( const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) { UInt64 res = 0; const char * haystack_end = haystack + haystack_size; const char * needle_end = needle + needle_size; #ifdef __SSE4_2__ static constexpr auto mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY; const char * haystack_end16 = haystack + haystack_size / 16 * 16; const char * needle_end16 = needle + needle_size / 16 * 16; for (; haystack < haystack_end16 && needle < needle_end16; haystack += 16, needle += 16) { __m128i s1 = _mm_loadu_si128(reinterpret_cast(haystack)); __m128i s2 = _mm_loadu_si128(reinterpret_cast(needle)); auto result_mask = _mm_cmpestrm(s1, 16, s2, 16, mode); const __m128i mask_hi = _mm_unpackhi_epi64(result_mask, result_mask); res += _mm_popcnt_u64(_mm_cvtsi128_si64(result_mask)) + _mm_popcnt_u64(_mm_cvtsi128_si64(mask_hi)); } #endif for (; haystack != haystack_end && needle != needle_end; ++haystack, ++needle) res += *haystack != *needle; res = res + (haystack_end - haystack) + (needle_end - needle); return res; } }; struct ByteEditDistanceImpl { using ResultType = UInt64; static constexpr size_t max_string_size = 1u << 16; static ResultType inline process( const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size) { if (haystack_size == 0 || needle_size == 0) return haystack_size + needle_size; /// Safety threshold against DoS, since we use two array to calculate the distance. if (haystack_size > max_string_size || needle_size > max_string_size) throw Exception( ErrorCodes::TOO_LARGE_STRING_SIZE, "The string size is too big for function byteEditDistance. " "Should be at most {}", max_string_size); PaddedPODArray distances0(haystack_size + 1, 0); PaddedPODArray distances1(haystack_size + 1, 0); ResultType substitution = 0; ResultType insertion = 0; ResultType deletion = 0; for (size_t i = 0; i <= haystack_size; ++i) distances0[i] = i; for (size_t pos_needle = 0; pos_needle < needle_size; ++pos_needle) { distances1[0] = pos_needle + 1; for (size_t pos_haystack = 0; pos_haystack < haystack_size; pos_haystack++) { deletion = distances0[pos_haystack + 1] + 1; insertion = distances1[pos_haystack] + 1; substitution = distances0[pos_haystack]; if (*(needle + pos_needle) != *(haystack + pos_haystack)) substitution += 1; distances1[pos_haystack + 1] = std::min(deletion, std::min(substitution, insertion)); } distances0.swap(distances1); } return distances0[haystack_size]; } }; struct NameByteHammingDistance { static constexpr auto name = "byteHammingDistance"; }; struct NameEditDistance { static constexpr auto name = "editDistance"; }; using FunctionByteHammingDistance = FunctionsStringSimilarity, NameByteHammingDistance>; using FunctionByteEditDistance = FunctionsStringSimilarity, NameEditDistance>; REGISTER_FUNCTION(StringDistance) { factory.registerFunction( FunctionDocumentation{.description = R"(Calculates Hamming distance between two byte-strings.)"}); factory.registerAlias("mismatches", NameByteHammingDistance::name); factory.registerFunction( FunctionDocumentation{.description = R"(Calculates the edit distance between two byte-strings.)"}); factory.registerAlias("levenshteinDistance", NameEditDistance::name); } }