Merge pull request #65269 from liuneng1994/edit-distance-utf8

Add function editDistanceUTF8
This commit is contained in:
Alexey Katsman 2024-06-18 18:46:13 +00:00 committed by GitHub
commit ddbe83d993
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 135 additions and 47 deletions

View File

@ -2178,6 +2178,32 @@ Result:
Alias: levenshteinDistance
## editDistanceUTF8
Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two UTF-8 strings.
**Syntax**
```sql
editDistanceUTF8(string1, string2)
```
**Examples**
``` sql
SELECT editDistanceUTF8('我是谁', '我是我');
```
Result:
``` text
┌─editDistanceUTF8('我是谁', '我是我')──┐
│ 1 │
└─────────────────────────────────────┘
```
Alias: levenshteinDistanceUTF8
## damerauLevenshteinDistance
Calculates the [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) between two byte strings.

View File

@ -113,6 +113,36 @@ struct ByteHammingDistanceImpl
}
};
/// Scans the `size` bytes starting at `data` and forwards every decoded unit to a callback.
///
/// For each position, `UTF8::seqLength` is asked for the sequence length of the current byte:
///  - length 1: the byte goes to `ascii_consumer` (as `unsigned char`) when that callback is set,
///    otherwise it goes to `utf8_consumer` (cast to `UInt32`); the cursor advances by one byte.
///  - any other length: the remaining input is decoded with `UTF8::convertUTF8ToCodePoint`;
///    the resulting code point goes to `utf8_consumer` and the cursor advances by that length.
///
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS when `UTF8::convertUTF8ToCodePoint` returns no value.
void parseUTF8String(const char * __restrict data, size_t size, std::function<void(UInt32)> utf8_consumer, std::function<void(unsigned char)> ascii_consumer = nullptr)
{
    const char * const end = data + size;

    while (data < end)
    {
        const size_t seq_len = UTF8::seqLength(*data);

        if (seq_len != 1)
        {
            /// Multi-byte sequence: decode it from the remaining tail of the input.
            const auto decoded = UTF8::convertUTF8ToCodePoint(data, end - data);
            if (!decoded.has_value())
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(data, end - data));

            utf8_consumer(decoded.value());
            data += seq_len;
            continue;
        }

        /// Single-byte sequence: prefer the dedicated byte callback when the caller supplied one.
        if (ascii_consumer)
            ascii_consumer(static_cast<unsigned char>(*data));
        else
            utf8_consumer(static_cast<UInt32>(*data));

        ++data;
    }
}
template <bool is_utf8>
struct ByteJaccardIndexImpl
{
@ -138,57 +168,28 @@ struct ByteJaccardIndexImpl
haystack_set.fill(0);
needle_set.fill(0);
while (haystack < haystack_end)
if constexpr (is_utf8)
{
size_t len = 1;
if constexpr (is_utf8)
len = UTF8::seqLength(*haystack);
if (len == 1)
parseUTF8String(
haystack,
haystack_size,
[&](UInt32 data) { haystack_utf8_set.insert(data); },
[&](unsigned char data) { haystack_set[data] = 1; });
parseUTF8String(
needle, needle_size, [&](UInt32 data) { needle_utf8_set.insert(data); }, [&](unsigned char data) { needle_set[data] = 1; });
}
else
{
while (haystack < haystack_end)
{
haystack_set[static_cast<unsigned char>(*haystack)] = 1;
++haystack;
}
else
{
auto code_point = UTF8::convertUTF8ToCodePoint(haystack, haystack_end - haystack);
if (code_point.has_value())
{
haystack_utf8_set.insert(code_point.value());
haystack += len;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(haystack, haystack_end - haystack));
}
}
}
while (needle < needle_end)
{
size_t len = 1;
if constexpr (is_utf8)
len = UTF8::seqLength(*needle);
if (len == 1)
while (needle < needle_end)
{
needle_set[static_cast<unsigned char>(*needle)] = 1;
++needle;
}
else
{
auto code_point = UTF8::convertUTF8ToCodePoint(needle, needle_end - needle);
if (code_point.has_value())
{
needle_utf8_set.insert(code_point.value());
needle += len;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(needle, needle_end - needle));
}
}
}
UInt8 intersection = 0;
@ -226,6 +227,7 @@ struct ByteJaccardIndexImpl
static constexpr size_t max_string_size = 1u << 16;
template<bool is_utf8>
struct ByteEditDistanceImpl
{
using ResultType = UInt64;
@ -242,6 +244,16 @@ struct ByteEditDistanceImpl
ErrorCodes::TOO_LARGE_STRING_SIZE,
"The string size is too big for function editDistance, should be at most {}", max_string_size);
PaddedPODArray<UInt32> haystack_utf8;
PaddedPODArray<UInt32> needle_utf8;
if constexpr (is_utf8)
{
parseUTF8String(haystack, haystack_size, [&](UInt32 data) { haystack_utf8.push_back(data); });
parseUTF8String(needle, needle_size, [&](UInt32 data) { needle_utf8.push_back(data); });
haystack_size = haystack_utf8.size();
needle_size = needle_utf8.size();
}
PaddedPODArray<ResultType> distances0(haystack_size + 1, 0);
PaddedPODArray<ResultType> distances1(haystack_size + 1, 0);
@ -261,9 +273,16 @@ struct ByteEditDistanceImpl
insertion = distances1[pos_haystack] + 1;
substitution = distances0[pos_haystack];
if (*(needle + pos_needle) != *(haystack + pos_haystack))
substitution += 1;
if constexpr (is_utf8)
{
if (needle_utf8[pos_needle] != haystack_utf8[pos_haystack])
substitution += 1;
}
else
{
if (*(needle + pos_needle) != *(haystack + pos_haystack))
substitution += 1;
}
distances1[pos_haystack + 1] = std::min(deletion, std::min(substitution, insertion));
}
distances0.swap(distances1);
@ -457,7 +476,12 @@ struct NameEditDistance
{
static constexpr auto name = "editDistance";
};
using FunctionEditDistance = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl>, NameEditDistance>;
using FunctionEditDistance = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl<false>>, NameEditDistance>;
/// Name tag for the UTF-8 edit distance function: `name` holds the function identifier "editDistanceUTF8".
struct NameEditDistanceUTF8
{
static constexpr auto name = "editDistanceUTF8";
};
using FunctionEditDistanceUTF8 = FunctionsStringSimilarity<FunctionStringDistanceImpl<ByteEditDistanceImpl<true>>, NameEditDistanceUTF8>;
struct NameDamerauLevenshteinDistance
{
@ -499,6 +523,10 @@ REGISTER_FUNCTION(StringDistance)
FunctionDocumentation{.description = R"(Calculates the edit distance between two byte-strings.)"});
factory.registerAlias("levenshteinDistance", NameEditDistance::name);
factory.registerFunction<FunctionEditDistanceUTF8>(
FunctionDocumentation{.description = R"(Calculates the edit distance between two UTF8 strings.)"});
factory.registerAlias("levenshteinDistanceUTF8", NameEditDistanceUTF8::name);
factory.registerFunction<FunctionDamerauLevenshteinDistance>(
FunctionDocumentation{.description = R"(Calculates the Damerau-Levenshtein distance two between two byte-string.)"});

View File

@ -13,53 +13,84 @@ clickhouse mouse 6
-- non-const arguments
byteHammingDistance 0
byteHammingDistance abc 3
byteHammingDistance Jerry 我是谁 9
byteHammingDistance abc 3
byteHammingDistance abc ab 1
byteHammingDistance abc abc 0
byteHammingDistance abc bc 3
byteHammingDistance clickhouse mouse 10
byteHammingDistance 我是谁 Tom 9
byteHammingDistance 我是谁 我是我 3
editDistance 0
editDistance abc 3
editDistance Jerry 我是谁 9
editDistance abc 3
editDistance abc ab 1
editDistance abc abc 0
editDistance abc bc 1
editDistance clickhouse mouse 6
editDistance 我是谁 Tom 9
editDistance 我是谁 我是我 3
editDistanceUTF8 0
editDistanceUTF8 abc 3
editDistanceUTF8 Jerry 我是谁 5
editDistanceUTF8 abc 3
editDistanceUTF8 abc ab 1
editDistanceUTF8 abc abc 0
editDistanceUTF8 abc bc 1
editDistanceUTF8 clickhouse mouse 6
editDistanceUTF8 我是谁 Tom 3
editDistanceUTF8 我是谁 我是我 1
damerauLevenshteinDistance 0
damerauLevenshteinDistance abc 3
damerauLevenshteinDistance Jerry 我是谁 9
damerauLevenshteinDistance abc 3
damerauLevenshteinDistance abc ab 1
damerauLevenshteinDistance abc abc 0
damerauLevenshteinDistance abc bc 1
damerauLevenshteinDistance clickhouse mouse 6
damerauLevenshteinDistance 我是谁 Tom 9
damerauLevenshteinDistance 我是谁 我是我 3
stringJaccardIndex 0
stringJaccardIndex abc 0
stringJaccardIndex Jerry 我是谁 0
stringJaccardIndex abc 0
stringJaccardIndex abc ab 0.6666666666666666
stringJaccardIndex abc abc 1
stringJaccardIndex abc bc 0.6666666666666666
stringJaccardIndex clickhouse mouse 0.4
stringJaccardIndex 我是谁 Tom 0
stringJaccardIndex 我是谁 我是我 0.625
stringJaccardIndexUTF8 0
stringJaccardIndexUTF8 abc 0
stringJaccardIndexUTF8 Jerry 我是谁 0
stringJaccardIndexUTF8 abc 0
stringJaccardIndexUTF8 abc ab 0.6666666666666666
stringJaccardIndexUTF8 abc abc 1
stringJaccardIndexUTF8 abc bc 0.6666666666666666
stringJaccardIndexUTF8 clickhouse mouse 0.4
stringJaccardIndexUTF8 我是谁 Tom 0
stringJaccardIndexUTF8 我是谁 我是我 0.6666666666666666
jaroSimilarity 0
jaroSimilarity abc 3
jaroSimilarity Jerry 我是谁 0
jaroSimilarity abc 3
jaroSimilarity abc ab 0.8888888888888888
jaroSimilarity abc abc 1
jaroSimilarity abc bc 0
jaroSimilarity clickhouse mouse 0
jaroSimilarity 我是谁 Tom 0
jaroSimilarity 我是谁 我是我 0.7777777777777777
jaroWinklerSimilarity 0
jaroWinklerSimilarity abc 3
jaroWinklerSimilarity Jerry 我是谁 0
jaroWinklerSimilarity abc 3
jaroWinklerSimilarity abc ab 0.9111111111111111
jaroWinklerSimilarity abc abc 1
jaroWinklerSimilarity abc bc 0
jaroWinklerSimilarity clickhouse mouse 0
jaroWinklerSimilarity 我是谁 Tom 0
jaroWinklerSimilarity 我是谁 我是我 0.8666666666666666
-- Special UTF-8 tests
0.4
0

View File

@ -26,11 +26,12 @@ CREATE TABLE t
) ENGINE = MergeTree ORDER BY s1;
-- actual test cases
INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse');
INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse') ('我是谁', 'Tom') ('Jerry', '我是谁') ('我是谁', '我是我');
SELECT '-- non-const arguments';
SELECT 'byteHammingDistance', s1, s2, byteHammingDistance(s1, s2) FROM t ORDER BY ALL;
SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY ALL;
SELECT 'editDistanceUTF8', s1, s2, editDistanceUTF8(s1, s2) FROM t ORDER BY ALL;
SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY ALL;
SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY ALL;
SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY ALL;

View File

@ -1541,6 +1541,7 @@ dumpColumnStructure
durations
ecto
editDistance
editDistanceUTF
embeddings
emptyArray
emptyArrayDate
@ -1900,6 +1901,7 @@ lessOrEquals
lessorequals
levenshtein
levenshteinDistance
levenshteinDistanceUTF
lexicographically
lgamma
libFuzzer