diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 47e16b67643..4df987b5e2a 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1371,6 +1371,86 @@ Result: └──────────────────┘ ``` +## byteHammingDistance + +Calculates the [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings. + +**Syntax** + +```sql +byteHammingDistance(string1, string2) +``` + +**Examples** + +``` sql +SELECT byteHammingDistance('karolin', 'kathrin'); +``` + +Result: + +``` text +┌─byteHammingDistance('karolin', 'kathrin')─┐ +│ 3 │ +└───────────────────────────────────────────┘ +``` + +Alias: `mismatches` + +## stringJaccardIndex + +Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings. + +**Syntax** + +```sql +stringJaccardIndex(string1, string2) +``` + +**Examples** + +``` sql +SELECT stringJaccardIndex('clickhouse', 'mouse'); +``` + +Result: + +``` text +┌─stringJaccardIndex('clickhouse', 'mouse')─┐ +│ 0.4 │ +└───────────────────────────────────────────┘ +``` + +## stringJaccardIndexUTF8 + +Like [stringJaccardIndex](#stringjaccardindex) but for UTF-8 encoded strings. + +## editDistance + +Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings. + +**Syntax** + +```sql +editDistance(string1, string2) +``` + +**Examples** + +``` sql +SELECT editDistance('clickhouse', 'mouse'); +``` + +Result: + +``` text +┌─editDistance('clickhouse', 'mouse')─┐ +│ 6 │ +└─────────────────────────────────────┘ +``` + +Alias: `levenshteinDistance` + ## initcap Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. 
diff --git a/tests/queries/0_stateless/02884_string_distance_function.sql b/tests/queries/0_stateless/02884_string_distance_function.sql index 8126cfb5bd9..e3d9051ce5b 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.sql +++ b/tests/queries/0_stateless/02884_string_distance_function.sql @@ -36,12 +36,12 @@ SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF\xFF\xFF\xF SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x41\xE2\x82\xAC')); SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x9F\x99\x82')); SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF')); -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError 36 } -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError 36 } -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError 36 } -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError 36 } -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError 36 } -SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError 36 } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError BAD_ARGUMENTS } +SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError BAD_ARGUMENTS } SELECT stringJaccardIndexUTF8('😃🌍', '🙃😃🌑'), 
stringJaccardIndex('😃🌍', '🙃😃🌑');