2023-04-09 08:41:11 +00:00
|
|
|
#include <cctype>
|
|
|
|
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionsHashing.h>
|
2023-04-10 11:12:17 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2023-04-09 08:41:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2023-04-10 11:12:17 +00:00
|
|
|
/** The implementation of this algorithm is the same as most SQL language implementations.
|
|
|
|
|
|
|
|
* Soundex wiki: https://en.wikipedia.org/wiki/Soundex
|
|
|
|
|
|
|
|
* The details are as follows:
|
|
|
|
|
|
|
|
* 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w to zero (0).
|
|
|
|
* 2. Replace all consonants (including the first letter) with digits as follows:
|
|
|
|
*
|
|
|
|
* - b, f, p, v → 1
|
|
|
|
* - c, g, j, k, q, s, x, z → 2
|
|
|
|
* - d, t → 3
|
|
|
|
* - l → 4
|
|
|
|
* - m, n → 5
|
|
|
|
* - r → 6
|
|
|
|
*
|
|
|
|
* 3. Replace all adjacent same digits with one digit, and then remove all the zero (0) digits
|
|
|
|
* 4. If the saved letter's digit is the same as the resulting first digit, remove the digit (keep the letter).
|
|
|
|
* 5. Append 3 zeros if the result contains fewer than 3 digits. Remove everything except the first letter and the 3 digits after it.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2023-04-09 08:41:11 +00:00
|
|
|
struct SoundexImpl
|
|
|
|
{
|
|
|
|
static constexpr auto name = "soundex";
|
2023-04-10 09:22:15 +00:00
|
|
|
enum
|
|
|
|
{
|
|
|
|
length = 4
|
|
|
|
};
|
2023-04-09 08:41:11 +00:00
|
|
|
static constexpr auto soundex_map = "01230120022455012623010202";
|
|
|
|
|
2023-04-11 01:49:06 +00:00
|
|
|
static void skipNonAlphaASCII(const char *& start, const char * end)
|
2023-04-09 09:24:54 +00:00
|
|
|
{
|
2023-04-11 01:49:06 +00:00
|
|
|
while (start < end && !isAlphaASCII(*start))
|
|
|
|
++start;
|
|
|
|
}
|
|
|
|
static char getScode(char c)
|
|
|
|
{
|
|
|
|
return soundex_map[c - 'A'];
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
|
|
|
|
2023-04-09 09:24:54 +00:00
|
|
|
static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
|
2023-04-09 08:41:11 +00:00
|
|
|
{
|
2023-04-09 09:24:54 +00:00
|
|
|
const char * in_cur = begin;
|
|
|
|
const char * in_end = begin + size;
|
2023-04-09 08:41:11 +00:00
|
|
|
unsigned char * out_end = out_char_data + length;
|
|
|
|
|
2023-04-11 01:49:16 +00:00
|
|
|
skipNonAlphaASCII(in_cur, in_end);
|
2023-04-09 09:24:54 +00:00
|
|
|
if (in_cur < in_end)
|
|
|
|
{
|
2023-04-10 11:12:17 +00:00
|
|
|
*out_char_data++ = toUpperIfAlphaASCII(*in_cur);
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
2023-04-11 01:49:34 +00:00
|
|
|
char prev_code = getScode(toUpperIfAlphaASCII(*in_cur));
|
2023-04-09 08:41:11 +00:00
|
|
|
|
2023-04-11 01:49:42 +00:00
|
|
|
char current_code;
|
2023-04-09 08:41:11 +00:00
|
|
|
in_cur++;
|
2023-04-10 09:06:31 +00:00
|
|
|
while (in_cur < in_end && out_char_data < out_end && (ch = getScode(in_cur, in_end)) != 0)
|
2023-04-09 08:41:11 +00:00
|
|
|
{
|
2023-04-09 09:24:54 +00:00
|
|
|
if (in_cur == in_end)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
2023-04-09 08:41:11 +00:00
|
|
|
in_cur++;
|
|
|
|
if ((ch != '0') && (ch != last_ch))
|
|
|
|
{
|
2023-04-09 09:24:54 +00:00
|
|
|
*out_char_data++ = ch;
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
|
|
|
last_ch = ch;
|
|
|
|
}
|
2023-04-09 09:24:54 +00:00
|
|
|
while (out_char_data < out_end)
|
|
|
|
{
|
|
|
|
*out_char_data++ = '0';
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Adds the soundex function to the function factory, passing the CaseInsensitive flag.
REGISTER_FUNCTION(Soundex)
{
    using FunctionSoundex = FunctionStringHashFixedString<SoundexImpl>;

    factory.registerFunction<FunctionSoundex>(
        Documentation{"Returns soundex code of a string."},
        FunctionFactory::CaseInsensitive);
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|