mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 17:12:03 +00:00
Add initcapUtf8: impl + tests
This commit is contained in:
parent
d6dacd3ccf
commit
1a40e30797
@ -1256,4 +1256,12 @@ Result:
|
||||
|
||||
## initcap
|
||||
|
||||
Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
|
||||
Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
|
||||
|
||||
## initcapUTF8
|
||||
|
||||
Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
|
||||
|
||||
Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
|
||||
|
||||
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
|
@ -1116,4 +1116,11 @@ Do Nothing for 2 Minutes 2:00
|
||||
|
||||
## initcap {#initcap}
|
||||
|
||||
Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов, разделённые любыми другими символами.
|
||||
Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов, разделённые любыми другими символами.
|
||||
|
||||
## initcapUTF8 {#initcapUTF8}
|
||||
|
||||
Как [initcap](#initcap), предполагая, что строка содержит набор байтов, представляющий текст в кодировке UTF-8.
|
||||
Не учитывает язык. То есть для турецкого языка результат может быть не совсем верным.
|
||||
Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным.
|
||||
Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено.
|
@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
|
||||
}
|
||||
else
|
||||
{
|
||||
static const Poco::UTF8Encoding utf8;
|
||||
|
||||
size_t src_sequence_length = UTF8::seqLength(*src);
|
||||
/// In case partial buffer was passed (due to SSE optimization)
|
||||
/// we cannot convert it with current src_end, but we may have more
|
||||
|
114
src/Functions/initcapUTF8.cpp
Normal file
114
src/Functions/initcapUTF8.cpp
Normal file
@ -0,0 +1,114 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Functions/LowerUpperUTF8Impl.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Poco/Unicode.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Error codes used in this file; only declared here, the definitions are not part of this file.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct InitcapUTF8Impl
|
||||
{
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
if (data.empty())
|
||||
return;
|
||||
res_data.resize(data.size());
|
||||
res_offsets.assign(offsets);
|
||||
array(data.data(), data.data() + data.size(), offsets, res_data.data());
|
||||
}
|
||||
|
||||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument");
|
||||
}
|
||||
|
||||
static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool& prev_alphanum)
|
||||
{
|
||||
size_t src_sequence_length = UTF8::seqLength(*src);
|
||||
auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
|
||||
|
||||
if (src_code_point)
|
||||
{
|
||||
bool alpha = Poco::Unicode::isAlpha(*src_code_point);
|
||||
bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point);
|
||||
|
||||
int dst_code_point = *src_code_point;
|
||||
if (alphanum && !prev_alphanum)
|
||||
{
|
||||
if (alpha)
|
||||
dst_code_point = Poco::Unicode::toUpper(*src_code_point);
|
||||
}
|
||||
else if (alpha)
|
||||
{
|
||||
dst_code_point = Poco::Unicode::toLower(*src_code_point);
|
||||
}
|
||||
prev_alphanum = alphanum;
|
||||
if (dst_code_point > 0)
|
||||
{
|
||||
size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
|
||||
assert(dst_sequence_length <= 4);
|
||||
|
||||
if (dst_sequence_length == src_sequence_length)
|
||||
{
|
||||
src += dst_sequence_length;
|
||||
dst += dst_sequence_length;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*dst = *src;
|
||||
++dst;
|
||||
++src;
|
||||
prev_alphanum = false;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
|
||||
{
|
||||
auto offset_it = offsets.begin();
|
||||
const UInt8 * begin = src;
|
||||
|
||||
/// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)
|
||||
while (src < src_end)
|
||||
{
|
||||
const UInt8 * row_end = begin + *offset_it;
|
||||
chassert(row_end >= src);
|
||||
bool prev_alphanum = false;
|
||||
while (src < row_end)
|
||||
processCodePoint(src, row_end, dst, prev_alphanum);
|
||||
++offset_it;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Name tag passed to FunctionStringToString; carries the name of the function.
struct NameInitcapUTF8
{
    static constexpr auto name = "initcapUTF8";
};
|
||||
|
||||
/// initcapUTF8 as a string-to-string function: FunctionStringToString instantiated with the implementation and its name tag.
using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>;
|
||||
|
||||
}
|
||||
|
||||
/// Registers FunctionInitcapUTF8 in the function factory.
REGISTER_FUNCTION(InitcapUTF8)
{
    factory.registerFunction<FunctionInitcapUTF8>();
}
|
||||
|
||||
}
|
@ -3,5 +3,11 @@ Hello
|
||||
Hello
|
||||
Hello World
|
||||
Yeah, Well, I`M Gonna Go Build My Own Theme Park
|
||||
Crc32ieee Is Best Function
|
||||
Crc32ieee Is The Best Function
|
||||
42ok
|
||||
|
||||
Hello
|
||||
Yeah, Well, I`M Gonna Go Build My Own Theme Park
|
||||
Привет, Как Дела?
|
||||
Ätsch, Bätsch
|
||||
We Dont Support Cases When Lowercase And Uppercase Characters Occupy Different Number Of Bytes In Utf-8. As An Example, This Happens For ß And ẞ.
|
||||
|
@ -3,5 +3,12 @@ select initcap('Hello');
|
||||
select initcap('hello');
|
||||
select initcap('hello world');
|
||||
select initcap('yeah, well, i`m gonna go build my own theme park');
|
||||
select initcap('CRC32IEEE is best function');
|
||||
select initcap('42oK');
|
||||
select initcap('CRC32IEEE is the best function');
|
||||
select initcap('42oK');
|
||||
|
||||
select initcapUTF8('');
|
||||
select initcapUTF8('Hello');
|
||||
select initcapUTF8('yeah, well, i`m gonna go build my own theme park');
|
||||
select initcapUTF8('привет, как дела?');
|
||||
select initcapUTF8('ätsch, bätsch');
|
||||
select initcapUTF8('We dont support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8. As an example, this happens for ß and ẞ.');
|
@ -1580,6 +1580,7 @@ indexOf
|
||||
infi
|
||||
initialQueryID
|
||||
initializeAggregation
|
||||
initcap
|
||||
injective
|
||||
innogames
|
||||
inodes
|
||||
|
Loading…
Reference in New Issue
Block a user