2023-04-09 08:41:11 +00:00
|
|
|
#include <cctype>
|
|
|
|
|
|
|
|
#include <Functions/FunctionFactory.h>
|
2023-04-12 22:41:26 +00:00
|
|
|
#include <Functions/FunctionStringToString.h>
|
2023-04-10 11:12:17 +00:00
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2023-04-09 08:41:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2023-04-12 22:41:26 +00:00
|
|
|
namespace ErrorCodes
{
    /// Declared here, defined elsewhere; thrown by SoundexImpl::vectorFixed.
    extern const int ILLEGAL_COLUMN;
}
|
|
|
|
/** Soundex algorithm, https://en.wikipedia.org/wiki/Soundex
  * Implemented similarly to most SQL dialects:
  * 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w to zero (0).
  * 2. Replace all consonants (including the first letter) with digits as follows:
  *  - b, f, p, v → 1
  *  - c, g, j, k, q, s, x, z → 2
  *  - d, t → 3
  *  - l → 4
  *  - m, n → 5
  *  - r → 6
  * 3. Collapse each run of adjacent identical digits into a single digit, then remove all zero (0) digits.
  * 4. If the saved letter's digit is the same as the resulting first digit, remove that digit (keep the letter).
  * 5. If the result contains fewer than 3 digits, pad it with zeros. Keep only the first letter and the 3 digits after it.
  */
|
|
|
|
|
2023-04-09 08:41:11 +00:00
|
|
|
struct SoundexImpl
|
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
static constexpr auto length = 4z;
|
2023-04-09 08:41:11 +00:00
|
|
|
static constexpr auto soundex_map = "01230120022455012623010202";
|
|
|
|
|
2023-04-12 22:41:26 +00:00
|
|
|
static void calculate(const char * value, size_t value_length, char * out)
|
2023-04-09 09:24:54 +00:00
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
const char * cur = value;
|
|
|
|
const char * const end = value + value_length;
|
|
|
|
char * const out_end = out + length;
|
2023-04-09 08:41:11 +00:00
|
|
|
|
2023-04-12 22:41:26 +00:00
|
|
|
while (cur < end && !isAlphaASCII(*cur))
|
|
|
|
++cur;
|
2023-04-09 08:41:11 +00:00
|
|
|
|
2023-04-11 05:01:53 +00:00
|
|
|
char prev_code = '0';
|
2023-04-12 22:41:26 +00:00
|
|
|
if (cur < end)
|
2023-04-09 09:24:54 +00:00
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
*out = toUpperIfAlphaASCII(*cur);
|
|
|
|
++out;
|
|
|
|
prev_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
|
|
|
|
++cur;
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
|
|
|
|
2023-04-12 22:41:26 +00:00
|
|
|
while (cur < end && !isAlphaASCII(*cur))
|
|
|
|
++cur;
|
|
|
|
|
|
|
|
while (cur < end && out < out_end)
|
2023-04-09 08:41:11 +00:00
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
char current_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
|
2023-04-11 01:49:52 +00:00
|
|
|
if ((current_code != '0') && (current_code != prev_code))
|
2023-04-09 08:41:11 +00:00
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
*out = current_code;
|
|
|
|
++out;
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
2023-04-11 01:49:52 +00:00
|
|
|
prev_code = current_code;
|
2023-04-12 22:41:26 +00:00
|
|
|
++cur;
|
|
|
|
|
|
|
|
while (cur < end && !isAlphaASCII(*cur))
|
|
|
|
++cur;
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
2023-04-12 22:41:26 +00:00
|
|
|
|
|
|
|
while (out < out_end)
|
2023-04-09 09:24:54 +00:00
|
|
|
{
|
2023-04-12 22:41:26 +00:00
|
|
|
*out = '0';
|
|
|
|
++out;
|
2023-04-09 08:41:11 +00:00
|
|
|
}
|
|
|
|
}
|
2023-04-12 22:41:26 +00:00
|
|
|
|
|
|
|
static void vector(
|
|
|
|
const ColumnString::Chars & data,
|
|
|
|
const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
|
|
|
const size_t size = offsets.size();
|
|
|
|
res_data.resize(size * (length + 1));
|
|
|
|
res_offsets.resize(size);
|
|
|
|
|
|
|
|
size_t prev_offset = 0;
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
|
|
|
|
const size_t value_length = offsets[i] - prev_offset - 1;
|
|
|
|
const size_t out_index = i * (length + 1);
|
|
|
|
calculate(value, value_length, reinterpret_cast<char *>(&res_data[out_index]));
|
|
|
|
res_data[out_index + length] = '\0';
|
|
|
|
res_offsets[i] = (out_index + length + 1);
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by soundex function");
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Name holder passed to FunctionStringToString together with SoundexImpl.
struct NameSoundex
{
    static constexpr auto name = "soundex";
};
|
|
|
|
|
|
|
|
/// Registers `soundex` in the function factory with the CaseInsensitive flag.
REGISTER_FUNCTION(Soundex)
{
    factory.registerFunction<FunctionStringToString<SoundexImpl, NameSoundex>>(
        FunctionDocumentation{.description="Returns Soundex code of a string."}, FunctionFactory::CaseInsensitive);
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|