Merge pull request #5807 from yandex/ngramSearch

Inverting ngramSearch to be more intuitive
2024-11-22 23:52:03 +00:00 · 2019-07-01 00:43:17 +03:00 · 2019-07-01 00:43:17 +03:00 · a9cfe4ce91
commit a9cfe4ce91
parent 1119056b71 9127c8b27c
5 changed files with 790 additions and 772 deletions
--- a/dbms/src/Functions/FunctionsStringSimilarity.cpp
+++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp
@ -271,11 +271,17 @@ struct NgramDistanceImpl
        {
            size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
            /// For !Symmetric version we should not use first_size.
-            res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1));
+            if constexpr (Symmetric)
+                res = distance * 1.f / std::max(first_size + second_size, size_t(1));
+            else
+                res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
        }
        else
        {
-            res = 1.f;
+            if constexpr (Symmetric)
+                res = 1.f;
+            else
+                res = 0.f;
        }
    }

@ -333,13 +339,19 @@ struct NgramDistanceImpl


                /// For !Symmetric version we should not use haystack_stats_size.
-                res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
+                if constexpr (Symmetric)
+                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
+                else
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
            }
            else
            {
                /// The strings are too big, so we assume they are not the same. This limits the number
                /// of bigrams added and avoids allocating too much memory.
-                res[i] = 1.f;
+                if constexpr (Symmetric)
+                    res[i] = 1.f;
+                else
+                    res[i] = 0.f;
            }

            prev_needle_offset = needle_offsets[i];
@ -399,11 +411,11 @@ struct NgramDistanceImpl
                    for (size_t j = 0; j < needle_stats_size; ++j)
                        --common_stats[needle_ngram_storage[j]];

-                    res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1));
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
                }
                else
                {
-                    res[i] = 1.f;
+                    res[i] = 0.f;
                }

                prev_offset = needle_offsets[i];
@ -446,12 +458,18 @@ struct NgramDistanceImpl
                    distance,
                    ngram_storage.get());
                /// For !Symmetric version we should not use haystack_stats_size.
-                res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
+                if constexpr (Symmetric)
+                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
+                else
+                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
            }
            else
            {
                /// If the strings are too big, we treat them as completely different.
-                res[i] = 1.f;
+                if constexpr (Symmetric)
+                    res[i] = 1.f;
+                else
+                    res[i] = 0.f;
            }
            distance = needle_stats_size;
            prev_offset = offsets[i];
--- a/dbms/tests/queries/0_stateless/00951_ngram_search.reference
+++ b/dbms/tests/queries/0_stateless/00951_ngram_search.reference
--- a/dbms/tests/queries/0_stateless/00951_ngram_search.sql
+++ b/dbms/tests/queries/0_stateless/00951_ngram_search.sql
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC

 ## ngramSearch(haystack, needle)

-Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search.
+Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from `needle` minus the number of common n-grams, normalized by the number of `needle` n-grams -- and returns one minus that value. The closer the result is to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.

 For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.

--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@ -97,7 +97,7 @@

 ## ngramSearch(haystack, needle)

-То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска.
+То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`, -- и возвращает единицу минус это значение. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.

 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.