Merge pull request #5807 from yandex/ngramSearch

Inverting ngramSearch to be more intuitive
This commit is contained in:
alexey-milovidov 2019-07-01 00:43:17 +03:00 committed by GitHub
commit a9cfe4ce91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 790 additions and 772 deletions

View File

@ -271,11 +271,17 @@ struct NgramDistanceImpl
{
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
/// For !Symmetric version we should not use first_size.
res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1));
if constexpr (Symmetric)
res = distance * 1.f / std::max(first_size + second_size, size_t(1));
else
res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
}
else
{
res = 1.f;
if constexpr (Symmetric)
res = 1.f;
else
res = 0.f;
}
}
@ -333,13 +339,19 @@ struct NgramDistanceImpl
/// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
/// of bigrams added and not allocating too much memory.
res[i] = 1.f;
if constexpr (Symmetric)
res[i] = 1.f;
else
res[i] = 0.f;
}
prev_needle_offset = needle_offsets[i];
@ -399,11 +411,11 @@ struct NgramDistanceImpl
for (size_t j = 0; j < needle_stats_size; ++j)
--common_stats[needle_ngram_storage[j]];
res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1));
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
res[i] = 1.f;
res[i] = 0.f;
}
prev_offset = needle_offsets[i];
@ -446,12 +458,18 @@ struct NgramDistanceImpl
distance,
ngram_storage.get());
/// For !Symmetric version we should not use haystack_stats_size.
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
if constexpr (Symmetric)
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
else
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
}
else
{
/// if the strings are too big, we say they are completely not the same
res[i] = 1.f;
if constexpr (Symmetric)
res[i] = 1.f;
else
res[i] = 0.f;
}
distance = needle_stats_size;
prev_offset = offsets[i];

View File

@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC
## ngramSearch(haystack, needle)
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search.
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. The closer to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.
For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.

View File

@ -97,7 +97,7 @@
## ngramSearch(haystack, needle)
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска.
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.