ClickHouse/src/Functions/soundex.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

79 lines
1.8 KiB
C++
Raw Normal View History

2023-04-09 08:41:11 +00:00
#include <cctype>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
namespace DB
{
/// Computes a fixed-width Soundex-style code for a byte string.
///
/// The code is the first ASCII letter of the input (upper-cased) followed by the
/// digits of the subsequent letters according to `soundex_map`. Letters mapped
/// to '0' are not emitted, and a digit equal to the one of the immediately
/// preceding letter is emitted only once. The result is always exactly `length`
/// bytes long and is right-padded with '0'.
///
/// Only ASCII letters take part in the encoding. Every other byte (digits,
/// punctuation, bytes >= 0x80 of multi-byte encodings) is skipped. The
/// classification is done by explicit range checks instead of <cctype>
/// functions: those are locale-dependent and have undefined behaviour for
/// negative `char` values, which could also lead to reading outside of
/// `soundex_map`.
struct SoundexImpl
{
    static constexpr auto name = "soundex";

    /// Number of bytes written by `apply`.
    enum
    {
        length = 4
    };

    /// Digit assigned to each letter, indexed by (upper-case letter - 'A').
    /*                                   ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    /*                                   :::::::::::::::::::::::::: */
    static constexpr auto soundex_map = "01230120022455012623010202";

    /// True for 'A'..'Z' and 'a'..'z' only; safe for any `char` value.
    static constexpr bool isAsciiLetter(char c) noexcept
    {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /// Upper-cases an ASCII letter and returns any other byte unchanged.
    static constexpr char toAsciiUpper(char c) noexcept
    {
        return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
    }

    /// Advances `ptr` to the next ASCII letter in [ptr, in_end) and returns its
    /// digit from `soundex_map`. `ptr` is left pointing at that letter (it is
    /// NOT consumed). Returns 0 (the NUL byte, not the character '0') when no
    /// letter is left; `ptr` is then equal to `in_end`.
    static char getScode(const char *& ptr, const char * in_end)
    {
        while (ptr < in_end && !isAsciiLetter(*ptr))
            ++ptr;

        if (ptr >= in_end)
            return 0;

        return soundex_map[toAsciiUpper(*ptr) - 'A'];
    }

    /// Encodes the `size` bytes starting at `begin` and writes exactly `length`
    /// bytes to `out_char_data`. No terminating zero byte is written.
    /// Input without any ASCII letter produces a code made of '0' only.
    static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
    {
        const char * in_cur = begin;
        const char * const in_end = begin + size;
        unsigned char * const out_end = out_char_data + length;

        /// Locate the first letter. Its digit seeds the duplicate suppression,
        /// so a following letter with the same digit is not emitted.
        char last_code = getScode(in_cur, in_end);
        if (in_cur < in_end)
        {
            *out_char_data++ = static_cast<unsigned char>(toAsciiUpper(*in_cur));
            /// Consume the letter only if there was one, so the pointer never
            /// moves beyond `in_end`.
            ++in_cur;
        }

        while (out_char_data < out_end)
        {
            const char code = getScode(in_cur, in_end);
            if (code == 0)
                break; /// input exhausted

            ++in_cur;
            if (code != '0' && code != last_code)
                *out_char_data++ = static_cast<unsigned char>(code);

            /// A '0' letter (vowel, H, W, Y) resets the suppression, so equal
            /// digits separated by such a letter are both emitted.
            last_code = code;
        }

        /// Pad short codes up to the fixed width.
        while (out_char_data < out_end)
            *out_char_data++ = '0';
    }
};
/// Registers the `soundex` function, implemented by `SoundexImpl`, in the
/// function factory. It is registered with the `CaseInsensitive` flag.
REGISTER_FUNCTION(Soundex)
{
    factory.registerFunction<FunctionStringHashFixedString<SoundexImpl>>(
        Documentation{"Returns soundex code of a string."}, FunctionFactory::CaseInsensitive);
}
}