mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Merge pull request #5807 from yandex/ngramSearch
Inverting ngramSearch to be more intuitive
This commit is contained in:
commit
a9cfe4ce91
@ -271,11 +271,17 @@ struct NgramDistanceImpl
|
||||
{
|
||||
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
|
||||
/// For !Symmetric version we should not use first_size.
|
||||
res = distance * 1.f / std::max(Symmetric * first_size + second_size, size_t(1));
|
||||
if constexpr (Symmetric)
|
||||
res = distance * 1.f / std::max(first_size + second_size, size_t(1));
|
||||
else
|
||||
res = 1.f - distance * 1.f / std::max(second_size, size_t(1));
|
||||
}
|
||||
else
|
||||
{
|
||||
res = 1.f;
|
||||
if constexpr (Symmetric)
|
||||
res = 1.f;
|
||||
else
|
||||
res = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
@ -333,13 +339,19 @@ struct NgramDistanceImpl
|
||||
|
||||
|
||||
/// For !Symmetric version we should not use haystack_stats_size.
|
||||
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
|
||||
if constexpr (Symmetric)
|
||||
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
|
||||
else
|
||||
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
|
||||
/// of bigrams added and not allocating too much memory.
|
||||
res[i] = 1.f;
|
||||
if constexpr (Symmetric)
|
||||
res[i] = 1.f;
|
||||
else
|
||||
res[i] = 0.f;
|
||||
}
|
||||
|
||||
prev_needle_offset = needle_offsets[i];
|
||||
@ -399,11 +411,11 @@ struct NgramDistanceImpl
|
||||
for (size_t j = 0; j < needle_stats_size; ++j)
|
||||
--common_stats[needle_ngram_storage[j]];
|
||||
|
||||
res[i] = distance * 1.f / std::max(needle_stats_size, size_t(1));
|
||||
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
|
||||
}
|
||||
else
|
||||
{
|
||||
res[i] = 1.f;
|
||||
res[i] = 0.f;
|
||||
}
|
||||
|
||||
prev_offset = needle_offsets[i];
|
||||
@ -446,12 +458,18 @@ struct NgramDistanceImpl
|
||||
distance,
|
||||
ngram_storage.get());
|
||||
/// For !Symmetric version we should not use haystack_stats_size.
|
||||
res[i] = distance * 1.f / std::max(Symmetric * haystack_stats_size + needle_stats_size, size_t(1));
|
||||
if constexpr (Symmetric)
|
||||
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
|
||||
else
|
||||
res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, size_t(1));
|
||||
}
|
||||
else
|
||||
{
|
||||
/// if the strings are too big, we say they are completely not the same
|
||||
res[i] = 1.f;
|
||||
if constexpr (Symmetric)
|
||||
res[i] = 1.f;
|
||||
else
|
||||
res[i] = 0.f;
|
||||
}
|
||||
distance = needle_stats_size;
|
||||
prev_offset = offsets[i];
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -108,7 +108,7 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceC
|
||||
|
||||
## ngramSearch(haystack, needle)
|
||||
|
||||
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. Can be useful for fuzzy string search.
|
||||
Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from `needle` minus the common number of n-grams, normalized by the number of `needle` n-grams. The function returns one minus this normalized difference, so the closer the result is to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search.
|
||||
|
||||
For case-insensitive search and/or search in UTF-8 format, use the functions `ngramSearchCaseInsensitive`, `ngramSearchUTF8`, `ngramSearchCaseInsensitiveUTF8`.
|
||||
|
||||
|
@ -97,7 +97,7 @@
|
||||
|
||||
## ngramSearch(haystack, needle)
|
||||
|
||||
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Может быть использовано для приближенного поиска.
|
||||
То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Функция возвращает единицу минус эту нормированную разность: чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска.
|
||||
|
||||
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user