mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge pull request #65269 from liuneng1994/edit-distance-utf8
Add function editDistanceUTF8
This commit is contained in:
commit
ddbe83d993
@ -2178,6 +2178,32 @@ Result:
|
||||
|
||||
Alias: levenshteinDistance
|
||||
|
||||
## editDistanceUTF8
|
||||
|
||||
Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two UTF8 strings.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
editDistanceUTF8(string1, string2)
|
||||
```
|
||||
|
||||
**Examples**
|
||||
|
||||
``` sql
|
||||
SELECT editDistanceUTF8('我是谁', '我是我');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─editDistanceUTF8('我是谁', '我是我')──┐
|
||||
│ 1 │
|
||||
└─────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Alias: levenshteinDistanceUTF8
|
||||
|
||||
## damerauLevenshteinDistance
|
||||
|
||||
Calculates the [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) between two byte strings.
|
||||
|
@ -113,6 +113,36 @@ struct ByteHammingDistanceImpl
|
||||
}
|
||||
};
|
||||
|
||||
void parseUTF8String(const char * __restrict data, size_t size, std::function<void(UInt32)> utf8_consumer, std::function<void(unsigned char)> ascii_consumer = nullptr)
|
||||
{
|
||||
const char * end = data + size;
|
||||
while (data < end)
|
||||
{
|
||||
size_t len = UTF8::seqLength(*data);
|
||||
if (len == 1)
|
||||
{
|
||||
if (ascii_consumer)
|
||||
ascii_consumer(static_cast<unsigned char>(*data));
|
||||
else
|
||||
utf8_consumer(static_cast<UInt32>(*data));
|
||||
++data;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto code_point = UTF8::convertUTF8ToCodePoint(data, end - data);
|
||||
if (code_point.has_value())
|
||||
{
|
||||
utf8_consumer(code_point.value());
|
||||
data += len;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(data, end - data));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool is_utf8>
|
||||
struct ByteJaccardIndexImpl
|
||||
{
|
||||
@ -138,57 +168,28 @@ struct ByteJaccardIndexImpl
|
||||
haystack_set.fill(0);
|
||||
needle_set.fill(0);
|
||||
|
||||
while (haystack < haystack_end)
|
||||
if constexpr (is_utf8)
|
||||
{
|
||||
size_t len = 1;
|
||||
if constexpr (is_utf8)
|
||||
len = UTF8::seqLength(*haystack);
|
||||
|
||||
if (len == 1)
|
||||
parseUTF8String(
|
||||
haystack,
|
||||
haystack_size,
|
||||
[&](UInt32 data) { haystack_utf8_set.insert(data); },
|
||||
[&](unsigned char data) { haystack_set[data] = 1; });
|
||||
parseUTF8String(
|
||||
needle, needle_size, [&](UInt32 data) { needle_utf8_set.insert(data); }, [&](unsigned char data) { needle_set[data] = 1; });
|
||||
}
|
||||
else
|
||||
{
|
||||
while (haystack < haystack_end)
|
||||
{
|
||||
haystack_set[static_cast<unsigned char>(*haystack)] = 1;
|
||||
++haystack;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto code_point = UTF8::convertUTF8ToCodePoint(haystack, haystack_end - haystack);
|
||||
if (code_point.has_value())
|
||||
{
|
||||
haystack_utf8_set.insert(code_point.value());
|
||||
haystack += len;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(haystack, haystack_end - haystack));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (needle < needle_end)
|
||||
{
|
||||
|
||||
size_t len = 1;
|
||||
if constexpr (is_utf8)
|
||||
len = UTF8::seqLength(*needle);
|
||||
|
||||
if (len == 1)
|
||||
while (needle < needle_end)
|
||||
{
|
||||
needle_set[static_cast<unsigned char>(*needle)] = 1;
|
||||
++needle;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto code_point = UTF8::convertUTF8ToCodePoint(needle, needle_end - needle);
|
||||
if (code_point.has_value())
|
||||
{
|
||||
needle_utf8_set.insert(code_point.value());
|
||||
needle += len;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(needle, needle_end - needle));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
UInt8 intersection = 0;
|
||||
@ -226,6 +227,7 @@ struct ByteJaccardIndexImpl
|
||||
|
||||
static constexpr size_t max_string_size = 1u << 16;
|
||||
|
||||
template<bool is_utf8>
|
||||
struct ByteEditDistanceImpl
|
||||
{
|
||||
using ResultType = UInt64;
|
||||
@ -242,6 +244,16 @@ struct ByteEditDistanceImpl
|
||||
ErrorCodes::TOO_LARGE_STRING_SIZE,
|
||||
"The string size is too big for function editDistance, should be at most {}", max_string_size);
|
||||
|
||||
PaddedPODArray<UInt32> haystack_utf8;
|
||||
PaddedPODArray<UInt32> needle_utf8;
|
||||
if constexpr (is_utf8)
|
||||
{
|
||||
parseUTF8String(haystack, haystack_size, [&](UInt32 data) { haystack_utf8.push_back(data); });
|
||||
parseUTF8String(needle, needle_size, [&](UInt32 data) { needle_utf8.push_back(data); });
|
||||
haystack_size = haystack_utf8.size();
|
||||
needle_size = needle_utf8.size();
|
||||
}
|
||||
|
||||
PaddedPODArray<ResultType> distances0(haystack_size + 1, 0);
|
||||
PaddedPODArray<ResultType> distances1(haystack_size + 1, 0);
|
||||
|
||||
@ -261,9 +273,16 @@ struct ByteEditDistanceImpl
|
||||
insertion = distances1[pos_haystack] + 1;
|
||||
substitution = distances0[pos_haystack];
|
||||
|
||||
if (*(needle + pos_needle) != *(haystack + pos_haystack))
|
||||
substitution += 1;
|
||||
|
||||
if constexpr (is_utf8)
|
||||
{
|
||||
if (needle_utf8[pos_needle] != haystack_utf8[pos_haystack])
|
||||
substitution += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*(needle + pos_needle) != *(haystack + pos_haystack))
|
||||
substitution += 1;
|
||||
}
|
||||
distances1[pos_haystack + 1] = std::min(deletion, std::min(substitution, insertion));
|
||||
}
|
||||
distances0.swap(distances1);
|
||||
@ -457,7 +476,12 @@ struct NameEditDistance
|
||||
{
|
||||
static constexpr auto name = "editDistance";
|
||||
};
|
||||
using FunctionEditDistance = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl>, NameEditDistance>;
|
||||
using FunctionEditDistance = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl<false>>, NameEditDistance>;
|
||||
/// Name tag for the UTF-8 variant of the edit distance function: `name` holds the identifier "editDistanceUTF8".
struct NameEditDistanceUTF8
|
||||
{
|
||||
static constexpr auto name = "editDistanceUTF8";
|
||||
};
|
||||
using FunctionEditDistanceUTF8 = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl<true>>, NameEditDistanceUTF8>;
|
||||
|
||||
struct NameDamerauLevenshteinDistance
|
||||
{
|
||||
@ -499,6 +523,10 @@ REGISTER_FUNCTION(StringDistance)
|
||||
FunctionDocumentation{.description = R"(Calculates the edit distance between two byte-strings.)"});
|
||||
factory.registerAlias("levenshteinDistance", NameEditDistance::name);
|
||||
|
||||
factory.registerFunction<FunctionEditDistanceUTF8>(
|
||||
FunctionDocumentation{.description = R"(Calculates the edit distance between two UTF8 strings.)"});
|
||||
factory.registerAlias("levenshteinDistanceUTF8", NameEditDistanceUTF8::name);
|
||||
|
||||
factory.registerFunction<FunctionDamerauLevenshteinDistance>(
|
||||
FunctionDocumentation{.description = R"(Calculates the Damerau-Levenshtein distance two between two byte-string.)"});
|
||||
|
||||
|
@ -13,53 +13,84 @@ clickhouse mouse 6
|
||||
-- non-const arguments
|
||||
byteHammingDistance 0
|
||||
byteHammingDistance abc 3
|
||||
byteHammingDistance Jerry 我是谁 9
|
||||
byteHammingDistance abc 3
|
||||
byteHammingDistance abc ab 1
|
||||
byteHammingDistance abc abc 0
|
||||
byteHammingDistance abc bc 3
|
||||
byteHammingDistance clickhouse mouse 10
|
||||
byteHammingDistance 我是谁 Tom 9
|
||||
byteHammingDistance 我是谁 我是我 3
|
||||
editDistance 0
|
||||
editDistance abc 3
|
||||
editDistance Jerry 我是谁 9
|
||||
editDistance abc 3
|
||||
editDistance abc ab 1
|
||||
editDistance abc abc 0
|
||||
editDistance abc bc 1
|
||||
editDistance clickhouse mouse 6
|
||||
editDistance 我是谁 Tom 9
|
||||
editDistance 我是谁 我是我 3
|
||||
editDistanceUTF8 0
|
||||
editDistanceUTF8 abc 3
|
||||
editDistanceUTF8 Jerry 我是谁 5
|
||||
editDistanceUTF8 abc 3
|
||||
editDistanceUTF8 abc ab 1
|
||||
editDistanceUTF8 abc abc 0
|
||||
editDistanceUTF8 abc bc 1
|
||||
editDistanceUTF8 clickhouse mouse 6
|
||||
editDistanceUTF8 我是谁 Tom 3
|
||||
editDistanceUTF8 我是谁 我是我 1
|
||||
damerauLevenshteinDistance 0
|
||||
damerauLevenshteinDistance abc 3
|
||||
damerauLevenshteinDistance Jerry 我是谁 9
|
||||
damerauLevenshteinDistance abc 3
|
||||
damerauLevenshteinDistance abc ab 1
|
||||
damerauLevenshteinDistance abc abc 0
|
||||
damerauLevenshteinDistance abc bc 1
|
||||
damerauLevenshteinDistance clickhouse mouse 6
|
||||
damerauLevenshteinDistance 我是谁 Tom 9
|
||||
damerauLevenshteinDistance 我是谁 我是我 3
|
||||
stringJaccardIndex 0
|
||||
stringJaccardIndex abc 0
|
||||
stringJaccardIndex Jerry 我是谁 0
|
||||
stringJaccardIndex abc 0
|
||||
stringJaccardIndex abc ab 0.6666666666666666
|
||||
stringJaccardIndex abc abc 1
|
||||
stringJaccardIndex abc bc 0.6666666666666666
|
||||
stringJaccardIndex clickhouse mouse 0.4
|
||||
stringJaccardIndex 我是谁 Tom 0
|
||||
stringJaccardIndex 我是谁 我是我 0.625
|
||||
stringJaccardIndexUTF8 0
|
||||
stringJaccardIndexUTF8 abc 0
|
||||
stringJaccardIndexUTF8 Jerry 我是谁 0
|
||||
stringJaccardIndexUTF8 abc 0
|
||||
stringJaccardIndexUTF8 abc ab 0.6666666666666666
|
||||
stringJaccardIndexUTF8 abc abc 1
|
||||
stringJaccardIndexUTF8 abc bc 0.6666666666666666
|
||||
stringJaccardIndexUTF8 clickhouse mouse 0.4
|
||||
stringJaccardIndexUTF8 我是谁 Tom 0
|
||||
stringJaccardIndexUTF8 我是谁 我是我 0.6666666666666666
|
||||
jaroSimilarity 0
|
||||
jaroSimilarity abc 3
|
||||
jaroSimilarity Jerry 我是谁 0
|
||||
jaroSimilarity abc 3
|
||||
jaroSimilarity abc ab 0.8888888888888888
|
||||
jaroSimilarity abc abc 1
|
||||
jaroSimilarity abc bc 0
|
||||
jaroSimilarity clickhouse mouse 0
|
||||
jaroSimilarity 我是谁 Tom 0
|
||||
jaroSimilarity 我是谁 我是我 0.7777777777777777
|
||||
jaroWinklerSimilarity 0
|
||||
jaroWinklerSimilarity abc 3
|
||||
jaroWinklerSimilarity Jerry 我是谁 0
|
||||
jaroWinklerSimilarity abc 3
|
||||
jaroWinklerSimilarity abc ab 0.9111111111111111
|
||||
jaroWinklerSimilarity abc abc 1
|
||||
jaroWinklerSimilarity abc bc 0
|
||||
jaroWinklerSimilarity clickhouse mouse 0
|
||||
jaroWinklerSimilarity 我是谁 Tom 0
|
||||
jaroWinklerSimilarity 我是谁 我是我 0.8666666666666666
|
||||
-- Special UTF-8 tests
|
||||
0.4
|
||||
0
|
||||
|
@ -26,11 +26,12 @@ CREATE TABLE t
|
||||
) ENGINE = MergeTree ORDER BY s1;
|
||||
|
||||
-- actual test cases
|
||||
INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse');
|
||||
INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse') ('我是谁', 'Tom') ('Jerry', '我是谁') ('我是谁', '我是我');
|
||||
|
||||
SELECT '-- non-const arguments';
|
||||
SELECT 'byteHammingDistance', s1, s2, byteHammingDistance(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'editDistanceUTF8', s1, s2, editDistanceUTF8(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY ALL;
|
||||
SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY ALL;
|
||||
|
@ -1541,6 +1541,7 @@ dumpColumnStructure
|
||||
durations
|
||||
ecto
|
||||
editDistance
|
||||
editDistanceUTF
|
||||
embeddings
|
||||
emptyArray
|
||||
emptyArrayDate
|
||||
@ -1900,6 +1901,7 @@ lessOrEquals
|
||||
lessorequals
|
||||
levenshtein
|
||||
levenshteinDistance
|
||||
levenshteinDistanceUTF
|
||||
lexicographically
|
||||
lgamma
|
||||
libFuzzer
|
||||
|
Loading…
Reference in New Issue
Block a user