mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Implement jaroSimilarity + jaroWinklerSimilarity
This commit is contained in:
parent
7d98fdede7
commit
ee0738e3df
@ -1487,6 +1487,54 @@ Result:
|
||||
└───────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## jaroSimilarity
|
||||
|
||||
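Calculates the [Jaro similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity) between two byte strings. The strings are compared byte by byte, so a multi-byte UTF-8 character is treated as several separate symbols.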
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
jaroSimilarity(string1, string2)
|
||||
```
|
||||
|
||||
**Examples**
|
||||
|
||||
``` sql
|
||||
SELECT jaroSimilarity('clickhouse', 'click');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─jaroSimilarity('clickhouse', 'click')─┐
|
||||
│ 0.8333333333333333 │
|
||||
└───────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## jaroWinklerSimilarity
|
||||
|
||||
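Calculates the [Jaro-Winkler similarity](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro%E2%80%93Winkler_similarity) between two byte strings. The strings are compared byte by byte, so a multi-byte UTF-8 character is treated as several separate symbols.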
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
jaroWinklerSimilarity(string1, string2)
|
||||
```
|
||||
|
||||
**Examples**
|
||||
|
||||
``` sql
|
||||
SELECT jaroWinklerSimilarity('clickhouse', 'click');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─jaroWinklerSimilarity('clickhouse', 'click')─┐
|
||||
│ 0.8999999999999999 │
|
||||
└──────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## initcap
|
||||
|
||||
Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
|
||||
|
@ -236,7 +236,7 @@ struct ByteEditDistanceImpl
|
||||
if (haystack_size == 0 || needle_size == 0)
|
||||
return haystack_size + needle_size;
|
||||
|
||||
/// Safety threshold against DoS, since we use two array to calculate the distance.
|
||||
/// Safety threshold against DoS, since we use two arrays to calculate the distance.
|
||||
if (haystack_size > max_string_size || needle_size > max_string_size)
|
||||
throw Exception(
|
||||
ErrorCodes::TOO_LARGE_STRING_SIZE,
|
||||
@ -280,7 +280,7 @@ struct ByteDamerauLevenshteinDistanceImpl
|
||||
static ResultType process(
|
||||
const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size)
|
||||
{
|
||||
/// Safety threshold against DoS, since we use two array to calculate the distance.
|
||||
/// Safety threshold against DoS
|
||||
if (haystack_size > max_string_size || needle_size > max_string_size)
|
||||
throw Exception(
|
||||
ErrorCodes::TOO_LARGE_STRING_SIZE,
|
||||
@ -335,6 +335,118 @@ struct ByteDamerauLevenshteinDistanceImpl
|
||||
}
|
||||
};
|
||||
|
||||
struct ByteJaroSimilarityImpl {
|
||||
|
||||
using ResultType = Float64;
|
||||
|
||||
static ResultType process(
|
||||
const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size)
|
||||
{
|
||||
/// Safety threshold against DoS
|
||||
if (haystack_size > max_string_size || needle_size > max_string_size)
|
||||
throw Exception(
|
||||
ErrorCodes::TOO_LARGE_STRING_SIZE,
|
||||
"The string size is too big for function jaroSimilarity, should be at most {}", max_string_size);
|
||||
|
||||
/// Shortcuts:
|
||||
|
||||
if (haystack_size == 0)
|
||||
return needle_size;
|
||||
|
||||
if (needle_size == 0)
|
||||
return haystack_size;
|
||||
|
||||
if (haystack_size == needle_size && memcmp(haystack, needle, haystack_size) == 0)
|
||||
return 1.0;
|
||||
|
||||
const int s1len = static_cast<int>(haystack_size);
|
||||
const int s2len = static_cast<int>(needle_size);
|
||||
|
||||
/// Window size to search for matches in the other string
|
||||
const int max_range = std::max(0, std::max(s1len, s2len) / 2 - 1);
|
||||
std::vector<int> s1_matching(s1len, -1);
|
||||
std::vector<int> s2_matching(s2len, -1);
|
||||
|
||||
/// Calculate matching characters
|
||||
size_t matching_characters = 0;
|
||||
for (int i = 0; i < s1len; i++)
|
||||
{
|
||||
/// Matching window
|
||||
const int min_index = std::max(i - max_range, 0);
|
||||
const int max_index = std::min(i + max_range + 1, s2len);
|
||||
for (int j = min_index; j < max_index; j++)
|
||||
{
|
||||
if (s2_matching[j] == -1 && haystack[i] == needle[j]) {
|
||||
s1_matching[i] = i;
|
||||
s2_matching[j] = j;
|
||||
matching_characters++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (matching_characters == 0)
|
||||
return 0.0;
|
||||
|
||||
/// Transpositions (one-way only)
|
||||
double transpositions = 0.0;
|
||||
for (size_t i = 0, s1i = 0, s2i = 0; i < matching_characters; i++)
|
||||
{
|
||||
while (s1_matching[s1i] == -1)
|
||||
s1i++;
|
||||
while (s2_matching[s2i] == -1)
|
||||
s2i++;
|
||||
if (haystack[s1i] != needle[s2i])
|
||||
transpositions += 0.5;
|
||||
s1i++;
|
||||
s2i++;
|
||||
}
|
||||
double m = static_cast<double>(matching_characters);
|
||||
double jaro_similarity = 1.0 / 3.0 * ( m / static_cast<double>(s1len)
|
||||
+ m / static_cast<double>(s2len)
|
||||
+ (m - transpositions) / m );
|
||||
return jaro_similarity;
|
||||
}
|
||||
};
|
||||
|
||||
struct ByteJaroWinklerSimilarityImpl {
|
||||
|
||||
using ResultType = Float64;
|
||||
|
||||
static ResultType process(
|
||||
const char * __restrict haystack, size_t haystack_size, const char * __restrict needle, size_t needle_size)
|
||||
{
|
||||
static constexpr int max_prefix_length = 4;
|
||||
static constexpr double scaling_factor = 0.1;
|
||||
static constexpr double boost_threshold = 0.7;
|
||||
|
||||
/// Safety threshold against DoS
|
||||
if (haystack_size > max_string_size || needle_size > max_string_size)
|
||||
throw Exception(
|
||||
ErrorCodes::TOO_LARGE_STRING_SIZE,
|
||||
"The string size is too big for function jaroWinklerSimilarity, should be at most {}", max_string_size);
|
||||
|
||||
const int s1len = static_cast<int>(haystack_size);
|
||||
const int s2len = static_cast<int>(needle_size);
|
||||
|
||||
ResultType jaro_winkler_similarity = ByteJaroSimilarityImpl::process(haystack, haystack_size, needle, needle_size);
|
||||
|
||||
if (jaro_winkler_similarity== -1.0)
|
||||
return -1.0;
|
||||
|
||||
if (jaro_winkler_similarity> boost_threshold)
|
||||
{
|
||||
const int common_length = std::min(max_prefix_length, std::min(s1len, s2len));
|
||||
int common_prefix = 0;
|
||||
while (common_prefix < common_length && haystack[common_prefix] == needle[common_prefix])
|
||||
common_prefix++;
|
||||
|
||||
jaro_winkler_similarity += common_prefix * scaling_factor * (1.0 - jaro_winkler_similarity);
|
||||
}
|
||||
return jaro_winkler_similarity;
|
||||
}
|
||||
};
|
||||
|
||||
struct NameByteHammingDistance
|
||||
{
|
||||
static constexpr auto name = "byteHammingDistance";
|
||||
@ -365,6 +477,18 @@ struct NameJaccardIndexUTF8
|
||||
};
|
||||
using FunctionStringJaccardIndexUTF8 = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteJaccardIndexImpl<true>>, NameJaccardIndexUTF8>;
|
||||
|
||||
/// Name tag carrying the SQL-visible name of the function built on ByteJaroSimilarityImpl.
struct NameJaroSimilarity
{
    static constexpr auto name = "jaroSimilarity";
};
|
||||
/// Function type for `jaroSimilarity`: ByteJaroSimilarityImpl plugged into the FunctionStringDistanceImpl /
/// FunctionsStringSimilarity templates (declared outside this view) together with its name tag.
using FunctionJaroSimilarity = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteJaroSimilarityImpl>, NameJaroSimilarity>;
|
||||
|
||||
/// Name tag carrying the SQL-visible name of the function built on ByteJaroWinklerSimilarityImpl.
struct NameJaroWinklerSimilarity
{
    static constexpr auto name = "jaroWinklerSimilarity";
};
|
||||
/// Function type for `jaroWinklerSimilarity`: ByteJaroWinklerSimilarityImpl plugged into the FunctionStringDistanceImpl /
/// FunctionsStringSimilarity templates (declared outside this view) together with its name tag.
using FunctionJaroWinklerSimilarity = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteJaroWinklerSimilarityImpl>, NameJaroWinklerSimilarity>;
|
||||
|
||||
REGISTER_FUNCTION(StringDistance)
|
||||
{
|
||||
factory.registerFunction<FunctionByteHammingDistance>(
|
||||
@ -376,11 +500,17 @@ REGISTER_FUNCTION(StringDistance)
|
||||
factory.registerAlias("levenshteinDistance", NameEditDistance::name);
|
||||
|
||||
factory.registerFunction<FunctionDamerauLevenshteinDistance>(
|
||||
FunctionDocumentation{.description = R"(Calculates the Damerau-Levenshtein distance two between two byte-string.)"});
|
||||
FunctionDocumentation{.description = R"(Calculates the Damerau-Levenshtein distance two between two byte-string.)"});
|
||||
|
||||
factory.registerFunction<FunctionStringJaccardIndex>(
|
||||
FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.)"});
|
||||
FunctionDocumentation{.description = R"(Calculates the Jaccard similarity index between two byte strings.)"});
|
||||
factory.registerFunction<FunctionStringJaccardIndexUTF8>(
|
||||
FunctionDocumentation{.description = R"(Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two UTF8 strings.)"});
|
||||
FunctionDocumentation{.description = R"(Calculates the Jaccard similarity index between two UTF8 strings.)"});
|
||||
|
||||
factory.registerFunction<FunctionJaroSimilarity>(
|
||||
FunctionDocumentation{.description = R"(Calculates the Jaro similarity between two byte-string.)"});
|
||||
|
||||
factory.registerFunction<FunctionJaroWinklerSimilarity>(
|
||||
FunctionDocumentation{.description = R"(Calculates the Jaro-Winkler similarity between two byte-string.)"});
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,8 @@ clickhouse mouse 6
|
||||
clickhouse mouse 6
|
||||
clickhouse mouse 0.4
|
||||
clickhouse mouse 0.4
|
||||
clickhouse mouse 0
|
||||
clickhouse mouse 0
|
||||
-- test aliases
|
||||
clickhouse mouse 10
|
||||
clickhouse mouse 6
|
||||
@ -44,6 +46,20 @@ stringJaccardIndexUTF8 abc ab 0.6666666666666666
|
||||
stringJaccardIndexUTF8 abc abc 1
|
||||
stringJaccardIndexUTF8 abc bc 0.6666666666666666
|
||||
stringJaccardIndexUTF8 clickhouse mouse 0.4
|
||||
jaroSimilarity 0
|
||||
jaroSimilarity abc 3
|
||||
jaroSimilarity abc 3
|
||||
jaroSimilarity abc ab 0.8888888888888888
|
||||
jaroSimilarity abc abc 1
|
||||
jaroSimilarity abc bc 0
|
||||
jaroSimilarity clickhouse mouse 0
|
||||
jaroWinklerSimilarity 0
|
||||
jaroWinklerSimilarity abc 3
|
||||
jaroWinklerSimilarity abc 3
|
||||
jaroWinklerSimilarity abc ab 0.9111111111111111
|
||||
jaroWinklerSimilarity abc abc 1
|
||||
jaroWinklerSimilarity abc bc 0
|
||||
jaroWinklerSimilarity clickhouse mouse 0
|
||||
-- Special UTF-8 tests
|
||||
0.4
|
||||
0
|
||||
|
@ -5,6 +5,8 @@ SELECT 'clickhouse' AS s1, 'mouse' AS s2, editDistance(s1, s2);
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, damerauLevenshteinDistance(s1, s2);
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, stringJaccardIndex(s1, s2);
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, stringJaccardIndexUTF8(s1, s2);
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, jaroSimilarity(s1, s2);
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, jaroWinklerSimilarity(s1, s2);
|
||||
|
||||
SELECT '-- test aliases';
|
||||
SELECT 'clickhouse' AS s1, 'mouse' AS s2, mismatches(s1, s2);
|
||||
@ -30,6 +32,8 @@ SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'jaroSimilarity', s1, s2, jaroSimilarity(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'jaroWinklerSimilarity', s1, s2, jaroWinklerSimilarity(s1, s2) FROM t ORDER BY ALL;
|
||||
|
||||
SELECT '-- Special UTF-8 tests';
|
||||
-- We do not perform full UTF8 validation, so sometimes it just returns some result
|
||||
|
Loading…
Reference in New Issue
Block a user