Move documentation of string similarity functions to better location

This commit is contained in:
Robert Schulze 2023-11-06 09:59:12 +00:00
parent 3b775dee53
commit ae1dcb5254
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
2 changed files with 86 additions and 6 deletions

View File

@ -1371,6 +1371,86 @@ Result:
└──────────────────┘
```
## byteHammingDistance
Calculates the [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings.
**Syntax**
```sql
byteHammingDistance(string1, string2)
```
**Examples**
``` sql
SELECT byteHammingDistance('karolin', 'kathrin');
```
Result:
``` text
┌─byteHammingDistance('karolin', 'kathrin')─┐
│                                         3 │
└───────────────────────────────────────────┘
```
Alias: mismatches
## stringJaccardIndex
Calculates the [Jaccard similarity index](https://en.wikipedia.org/wiki/Jaccard_index) between two byte strings.
**Syntax**
```sql
stringJaccardIndex(string1, string2)
```
**Examples**
``` sql
SELECT stringJaccardIndex('clickhouse', 'mouse');
```
Result:
``` text
┌─stringJaccardIndex('clickhouse', 'mouse')─┐
│                                       0.4 │
└───────────────────────────────────────────┘
```
## stringJaccardIndexUTF8
Like [stringJaccardIndex](#stringJaccardIndex) but for UTF-8 encoded strings.
## editDistance
Calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two byte strings.
**Syntax**
```sql
editDistance(string1, string2)
```
**Examples**
``` sql
SELECT editDistance('clickhouse', 'mouse');
```
Result:
``` text
┌─editDistance('clickhouse', 'mouse')─┐
│                                   6 │
└─────────────────────────────────────┘
```
Alias: levenshteinDistance
## initcap
Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.

View File

@ -36,12 +36,12 @@ SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF\xFF\xFF\xF
-- Queries in this first group carry no serverError hint, so they are presumably expected to return a result.
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\x41\xE2\x82\xAC'));
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x9F\x99\x82'));
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xFF'));
-- Six byte sequences for which the query is expected to fail; here the expected error is written as numeric code 36.
-- NOTE(review): these are the same six queries as the BAD_ARGUMENTS group below and look like the
-- pre-change side of a diff hunk; confirm that only one of the two groups exists in the real test file.
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError 36 }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError 36 }
-- Same six failing inputs, with the expected error given by its symbolic name instead of a numeric code.
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC2\x01')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC1\x81')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xF0\x80\x80\x41')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xC0\x80')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xD8\x00 ')); -- { serverError BAD_ARGUMENTS }
SELECT stringJaccardIndexUTF8(materialize('hello'), materialize('\xDC\x00')); -- { serverError BAD_ARGUMENTS }
-- Runs the UTF8 variant and the byte-string variant side by side on the same emoji input.
SELECT stringJaccardIndexUTF8('😃🌍', '🙃😃🌑'), stringJaccardIndex('😃🌍', '🙃😃🌑');