From f613c06c479f234d7894ef5cfe83a81317fc5c50 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 5 May 2019 23:48:46 +0300 Subject: [PATCH] ngram vector_vector distance added --- .../Functions/FunctionsStringSimilarity.cpp | 137 +++++-- .../src/Functions/FunctionsStringSimilarity.h | 29 +- dbms/tests/performance/ngram_distance.xml | 5 +- .../00909_ngram_distance.reference | 346 ++++++++++++++++++ .../0_stateless/00909_ngram_distance.sql | 74 ++++ 5 files changed, 554 insertions(+), 37 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 255301c00ed..d640cbf27e4 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -164,43 +164,46 @@ struct NgramDistanceImpl return num; } + template static ALWAYS_INLINE inline size_t calculateNeedleStats( const char * data, const size_t size, NgramStats & ngram_stats, + [[maybe_unused]] UInt16 * ngram_storage, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt16 (*hash_functor)(const CodePoint *)) { - // To prevent size_t overflow below. - if (size < N) - return 0; - const char * start = data; const char * end = data + size; CodePoint cp[simultaneously_codepoints_num] = {}; - /// read_code_points returns the position of cp where it stopped reading codepoints. size_t found = read_code_points(cp, start, end); /// We need to start for the first time here, because first N - 1 codepoints mean nothing. size_t i = N - 1; - /// Initialize with this value because for the first time `found` does not initialize first N - 1 codepoints. - size_t len = -N + 1; + size_t len = 0; do { - len += found - N + 1; for (; i + N <= found; ++i) - ++ngram_stats[hash_functor(cp + i)]; + { + ++len; + UInt16 hash = hash_functor(cp + i); + if constexpr (SaveNgrams) + *ngram_storage++ = hash; + ++ngram_stats[hash]; + } i = 0; } while (start < end && (found = read_code_points(cp, start, end))); return len; } + template static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( const char * data, const size_t size, NgramStats & ngram_stats, size_t & distance, + [[maybe_unused]] UInt16 * ngram_storage, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt16 (*hash_functor)(const CodePoint *)) { @@ -209,18 +212,6 @@ struct NgramDistanceImpl const char * end = data + size; CodePoint cp[simultaneously_codepoints_num] = {}; - /// allocation tricks, most strings are relatively small - static constexpr size_t small_buffer_size = 256; - std::unique_ptr big_buffer; - UInt16 small_buffer[small_buffer_size]; - UInt16 * ngram_storage = small_buffer; - - if (size > small_buffer_size) - { - ngram_storage = new UInt16[size]; - big_buffer.reset(ngram_storage); - } - /// read_code_points returns the position of cp where it stopped reading codepoints. size_t found = read_code_points(cp, start, end); /// We need to start for the first time here, because first N - 1 codepoints mean nothing. @@ -235,21 +226,25 @@ struct NgramDistanceImpl --distance; else ++distance; - - ngram_storage[ngram_cnt++] = hash; + if constexpr (ReuseStats) + ngram_storage[ngram_cnt] = hash; + ++ngram_cnt; --ngram_stats[hash]; } iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); /// Return the state of hash map to its initial. - for (size_t i = 0; i < ngram_cnt; ++i) - ++ngram_stats[ngram_storage[i]]; + if constexpr (ReuseStats) + { + for (size_t i = 0; i < ngram_cnt; ++i) + ++ngram_stats[ngram_storage[i]]; + } return ngram_cnt; } template - static inline size_t dispatchSearcher(Callback callback, Args &&... args) + static inline auto dispatchSearcher(Callback callback, Args &&... args) { if constexpr (!UTF8) return callback(std::forward(args)..., readASCIICodePoints, ASCIIHash); @@ -268,11 +263,11 @@ struct NgramDistanceImpl needle.resize(needle_size + default_padding); data.resize(data_size + default_padding); - size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); + size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats, nullptr); size_t distance = second_size; if (data_size <= max_string_size) { - size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance); + size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); res = distance * 1.f / std::max(first_size + second_size, size_t(1)); } else @@ -281,18 +276,94 @@ struct NgramDistanceImpl } } + static void vector_vector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offsets, + PaddedPODArray & res) + { + const size_t haystack_offsets_size = haystack_offsets.size(); + size_t prev_haystack_offset = 0; + size_t prev_needle_offset = 0; + + NgramStats common_stats; + memset(common_stats, 0, sizeof(common_stats)); + + /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). + /// And we can reuse these storages in one thread because we care only about what was written to first places. + std::unique_ptr needle_ngram_storage; + std::unique_ptr haystack_ngram_storage; + needle_ngram_storage.reset(new UInt16[max_string_size]); + haystack_ngram_storage.reset(new UInt16[max_string_size]); + + for (size_t i = 0; i < haystack_offsets_size; ++i) + { + const char * haystack = reinterpret_cast(&haystack_data[prev_haystack_offset]); + const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1; + const char * needle = reinterpret_cast(&needle_data[prev_needle_offset]); + const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1; + + if (needle_size <= max_string_size && haystack_size <= max_string_size) + { + /// Get needle stats. + const size_t needle_stats_size = dispatchSearcher( + calculateNeedleStats, + needle, + needle_size, + common_stats, + needle_ngram_storage.get()); + + size_t distance = needle_stats_size; + + /// Combine with haystack stats, return to initial needle stats. + const size_t haystack_stats_size = dispatchSearcher( + calculateHaystackStatsAndMetric, + haystack, + haystack_size, + common_stats, + distance, + haystack_ngram_storage.get()); + + /// Return to zero array stats. + for (size_t j = 0; j < needle_stats_size; ++j) + --common_stats[needle_ngram_storage[j]]; + + /// For now, common stats is a zero array. + res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); + } + else + { + /// String are too big, we are assuming they are not the same. This is done because of limiting number + /// of bigrams added and not allocating too much memory. + res[i] = 1.f; + } + + prev_needle_offset = needle_offsets[i]; + prev_haystack_offset = haystack_offsets[i]; + } + } + static void vector_constant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + std::string needle, + PaddedPODArray & res) { /// zeroing our map NgramStats common_stats; memset(common_stats, 0, sizeof(common_stats)); + /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb). + /// And we can reuse these storages in one thread because we care only about what was written to first places. + std::unique_ptr ngram_storage; + ngram_storage.reset(new UInt16[max_string_size]); + /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case. const size_t needle_size = needle.size(); needle.resize(needle_size + default_padding); - const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); + const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats, nullptr); size_t distance = needle_stats_size; size_t prev_offset = 0; @@ -303,7 +374,11 @@ struct NgramDistanceImpl if (haystack_size <= max_string_size) { size_t haystack_stats_size = dispatchSearcher( - calculateHaystackStatsAndMetric, reinterpret_cast(haystack), haystack_size, common_stats, distance); + calculateHaystackStatsAndMetric, + reinterpret_cast(haystack), + haystack_size, common_stats, + distance, + ngram_storage.get()); res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); } else diff --git a/dbms/src/Functions/FunctionsStringSimilarity.h b/dbms/src/Functions/FunctionsStringSimilarity.h index 7a106980722..9fd11e47622 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.h +++ b/dbms/src/Functions/FunctionsStringSimilarity.h @@ -62,10 +62,7 @@ public: const ColumnConst * col_haystack_const = typeid_cast(&*column_haystack); const ColumnConst * col_needle_const = typeid_cast(&*column_needle); - if (!col_needle_const) - throw Exception("Second argument of function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_COLUMN); - - if (col_haystack_const) + if (col_haystack_const && col_needle_const) { ResultType res{}; const String & needle = col_needle_const->getValue(); @@ -88,8 +85,9 @@ public: vec_res.resize(column_haystack->size()); const ColumnString * col_haystack_vector = checkAndGetColumn(&*column_haystack); + const ColumnString * col_needle_vector = checkAndGetColumn(&*column_needle); - if (col_haystack_vector) + if (col_haystack_vector && col_needle_const) { const String & needle = col_needle_const->getValue(); if (needle.size() > Impl::max_string_size) @@ -101,6 +99,27 @@ public: } Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res); } + else if (col_haystack_vector && col_needle_vector) + { + Impl::vector_vector( + col_haystack_vector->getChars(), + col_haystack_vector->getOffsets(), + col_needle_vector->getChars(), + col_needle_vector->getOffsets(), + vec_res); + } + else if (col_haystack_const && col_needle_vector) + { + const String & needle = col_haystack_const->getValue(); + if (needle.size() > Impl::max_string_size) + { + throw Exception( + "String size of needle is too big for function " + getName() + ". Should be at most " + + std::to_string(Impl::max_string_size), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + Impl::vector_constant(col_needle_vector->getChars(), col_needle_vector->getOffsets(), needle, vec_res); + } else { throw Exception( diff --git a/dbms/tests/performance/ngram_distance.xml b/dbms/tests/performance/ngram_distance.xml index 16960811067..af1d81e6f61 100644 --- a/dbms/tests/performance/ngram_distance.xml +++ b/dbms/tests/performance/ngram_distance.xml @@ -21,13 +21,16 @@ 60000 - + SELECT DISTINCT URL,Title, ngramDistance(Title, URL) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT SearchPhrase,Title, ngramDistance(Title, SearchPhrase) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistance(Title, 'baby dont hurt me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistance(Title, 'no more') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL,Title, ngramDistanceUTF8(Title, URL) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT SearchPhrase,Title, ngramDistanceUTF8(Title, SearchPhrase) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'чем занимаешься') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.reference b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference index 356cc5db466..4fc02992bdf 100644 --- a/dbms/tests/queries/0_stateless/00909_ngram_distance.reference +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference @@ -33,6 +33,76 @@ 1000 1000 1000 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 0 1000 1000 @@ -40,6 +110,39 @@ 77 636 1000 +привет как дела?... Херсон 0 +привет как дела клип - Яндекс.Видео 0 +привет 0 +пап привет как дела - Яндекс.Видео 0 +привет братан как дела - Яндекс.Видео 0 +http://metric.ru/ 0 +http://autometric.ru/ 0 +http://metrica.yandex.com/ 0 +http://metris.ru/ 0 +http://metrika.ru/ 0 + 0 + 0 +привет как дела?... Херсон 600 +пап привет как дела - Яндекс.Видео 684 +привет как дела клип - Яндекс.Видео 692 +привет братан как дела - Яндекс.Видео 707 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 0 +http://metric.ru/ 765 +http://metris.ru/ 765 +http://metrika.ru/ 778 +http://autometric.ru/ 810 +http://metrica.yandex.com/ 846 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 привет как дела?... Херсон 297 пап привет как дела - Яндекс.Видео 422 привет как дела клип - Яндекс.Видео 435 @@ -152,6 +255,76 @@ http://metrika.ru/ 1000 1000 1000 1000 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 0 1000 1000 @@ -159,6 +332,39 @@ http://metrika.ru/ 1000 77 636 1000 +привет как дела?... Херсон 0 +привет как дела клип - Яндекс.Видео 0 +привет 0 +пап привет как дела - Яндекс.Видео 0 +привет братан как дела - Яндекс.Видео 0 +http://metric.ru/ 0 +http://autometric.ru/ 0 +http://metrica.yandex.com/ 0 +http://metris.ru/ 0 +http://metrika.ru/ 0 + 0 + 0 +привет как дела?... Херсон 600 +пап привет как дела - Яндекс.Видео 684 +привет как дела клип - Яндекс.Видео 692 +привет братан как дела - Яндекс.Видео 707 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 0 +http://metric.ru/ 765 +http://metris.ru/ 765 +http://metrika.ru/ 778 +http://autometric.ru/ 810 +http://metrica.yandex.com/ 846 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 привет как дела?... Херсон 297 пап привет как дела - Яндекс.Видео 422 привет как дела клип - Яндекс.Видео 435 @@ -293,6 +499,76 @@ http://metrika.ru/ 1000 1000 1000 1000 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 0 0 0 @@ -412,6 +688,76 @@ http://metrika.ru/ 1000 1000 1000 1000 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 0 0 0 diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.sql b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql index ea7bd252fc7..ed800bf6c97 100644 --- a/dbms/tests/queries/0_stateless/00909_ngram_distance.sql +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql @@ -6,6 +6,22 @@ select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абв select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize(''), materialize('')))=round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абв'), materialize('')))=round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize(''), materialize('абв')))=round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('абвгдеёжз')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('абвгдеёж')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('гдеёзд')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('ёёёёёёёё')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; + +select round(1000 * ngramDistanceUTF8('', materialize('')))=round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('абв', materialize('')))=round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('', materialize('абв')))=round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('абвгдеёжз')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('абвгдеёж')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('гдеёзд')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('ёёёёёёёё')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; + select round(1000 * ngramDistanceUTF8('', '')); select round(1000 * ngramDistanceUTF8('абв', '')); select round(1000 * ngramDistanceUTF8('', 'абв')); @@ -18,6 +34,10 @@ drop table if exists test_distance; create table test_distance (Title String) engine = Memory; insert into test_distance values ('привет как дела?... Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), (''); +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, Title) as distance; +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, extract(Title, 'как дела')) as distance; +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, extract(Title, 'metr')) as distance; + SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'привет как дела') as distance; SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'как привет дела') as distance; SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'metrika') as distance; @@ -35,6 +55,23 @@ select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеё select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5; select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''),materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'),materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), materialize('абв'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), materialize('АбвгдЕёжз'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), materialize('АбвГдеёж'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), materialize('гдеёЗД'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), materialize('ЁЁЁЁЁЁЁЁ'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5; + +select round(1000 * ngramDistanceCaseInsensitiveUTF8('', materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв',materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('', materialize('абв'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвГДЕёжз', materialize('АбвгдЕёжз'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', materialize('АбвГдеёж'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', materialize('гдеёЗД'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', materialize('ЁЁЁЁЁЁЁЁ'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5; + + select round(1000 * ngramDistanceCaseInsensitiveUTF8('', '')); select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв', '')); select round(1000 * ngramDistanceCaseInsensitiveUTF8('', 'абв')); @@ -43,6 +80,10 @@ select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', 'Аб select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', 'гдеёЗД')); select round(1000 * ngramDistanceCaseInsensitiveUTF8('АБВГДеёжз', 'ЁЁЁЁЁЁЁЁ')); +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, Title) as distance; +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, extract(Title, 'как дела')) as distance; +SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, extract(Title, 'metr')) as distance; + SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'ПрИвЕт кАК ДЕЛа') as distance; SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'как ПРИВЕТ дела') as distance; SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrika') as distance; @@ -62,6 +103,23 @@ select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from syst select round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5; select round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize(''),materialize('')))=round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abc'),materialize('')))=round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize(''), materialize('abc')))=round(1000 * ngramDistance(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('abcdefgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('abcdefg')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('defgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('aaaaaaaa')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistance('',materialize('')))=round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistance('abc', materialize('')))=round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistance('', materialize('abc')))=round(1000 * ngramDistance(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * ngramDistance('abcdefgh', materialize('abcdefgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistance('abcdefgh', materialize('abcdefg')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5; +select round(1000 * ngramDistance('abcdefgh', materialize('defgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistance('abcdefgh', materialize('aaaaaaaa')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5; + + select round(1000 * ngramDistance('', '')); select round(1000 * ngramDistance('abc', '')); select round(1000 * ngramDistance('', 'abc')); @@ -86,6 +144,22 @@ select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcde select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5; select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), materialize('abc')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), materialize('Abcdefgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), materialize('abcdeFG')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), materialize('defgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), materialize('aaaaaaaa')))=round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistanceCaseInsensitive('', materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('abc', materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('', materialize('abc')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('abCdefgH', materialize('Abcdefgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('abcdefgh', materialize('abcdeFG')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('AAAAbcdefgh', materialize('defgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive('ABCdefgH', materialize('aaaaaaaa')))=round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5; + select round(1000 * ngramDistanceCaseInsensitive('', '')); select round(1000 * ngramDistanceCaseInsensitive('abc', '')); select round(1000 * ngramDistanceCaseInsensitive('', 'abc'));