diff --git a/dbms/include/DB/Functions/FunctionsString.h b/dbms/include/DB/Functions/FunctionsString.h index 3025aa46942..4ab35f71ef8 100644 --- a/dbms/include/DB/Functions/FunctionsString.h +++ b/dbms/include/DB/Functions/FunctionsString.h @@ -202,46 +202,11 @@ struct LengthUTF8Impl }; -/** Переводит строку в нижний (верхний) регистр, в текущей локали, в однобайтовой кодировке. - */ -template +template struct LowerUpperImpl { static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets, ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets) - { - res_data.resize(data.size()); - res_offsets.assign(offsets); - array(&*data.begin(), &*data.end(), &*res_data.begin()); - } - - static void vector_fixed(const ColumnString::Chars_t & data, size_t n, - ColumnString::Chars_t & res_data) - { - res_data.resize(data.size()); - array(&*data.begin(), &*data.end(), &*res_data.begin()); - } - - static void constant(const std::string & data, std::string & res_data) - { - res_data.resize(data.size()); - array(reinterpret_cast(&*data.begin()), reinterpret_cast(&*data.end()), - reinterpret_cast(&*res_data.begin())); - } - -private: - static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst) - { - for (; src < src_end; ++src, ++dst) - *dst = F(*src); - } -}; - -template -struct LowerUpperImplVectorized -{ - static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets, - ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets) { res_data.resize(data.size()); res_offsets.assign(offsets); @@ -348,9 +313,14 @@ inline void UTF8CyrillicToCase(const UInt8 * & src, const UInt8 * const src_end, } }; +/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр. + * Замечание: предполагается, что после перевода символа в другой регистр, + * длина его мультибайтовой последовательности в UTF-8 не меняется. + * Иначе - поведение не определено. + */ template -struct LowerUpperUTF8ImplVectorized +struct LowerUpperUTF8Impl { static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets, ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets) @@ -487,59 +457,6 @@ private: }; -/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр. - * Замечание: предполагается, что после перевода символа в другой регистр, - * длина его мультибайтовой последовательности в UTF-8 не меняется. - * Иначе - поведение не определено. - */ -template -struct LowerUpperUTF8Impl -{ - static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets, - ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets) - { - res_data.resize(data.size()); - res_offsets.assign(offsets); - array(&*data.begin(), &*data.end(), &*res_data.begin()); - } - - static void vector_fixed(const ColumnString::Chars_t & data, size_t n, - ColumnString::Chars_t & res_data) - { - res_data.resize(data.size()); - array(&*data.begin(), &*data.end(), &*res_data.begin()); - } - - static void constant(const std::string & data, std::string & res_data) - { - res_data.resize(data.size()); - array(reinterpret_cast(&*data.begin()), reinterpret_cast(&*data.end()), - reinterpret_cast(&*res_data.begin())); - } - -private: - static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst) - { - static Poco::UTF8Encoding utf8; - - while (src < src_end) - { - int chars = utf8.convert(F(utf8.convert(src)), dst, src_end - src); - if (chars) - { - src += chars; - dst += chars; - } - else - { - ++src; - ++dst; - } - } - } -}; - - /** Разворачивает строку в байтах. */ struct ReverseImpl @@ -1676,32 +1593,22 @@ struct NameReverseUTF8 { static constexpr auto name = "reverseUTF8"; }; struct NameSubstring { static constexpr auto name = "substring"; }; struct NameSubstringUTF8 { static constexpr auto name = "substringUTF8"; }; -struct NameSSELower { static constexpr auto name = "sse_lower"; }; -struct NameSSEUpper { static constexpr auto name = "sse_upper"; }; -struct NameSSELowerUTF8 { static constexpr auto name = "sse_lowerUTF8"; }; -struct NameSSEUpperUTF8 { static constexpr auto name = "sse_upperUTF8"; }; - typedef FunctionStringOrArrayToT, NameEmpty, UInt8> FunctionEmpty; typedef FunctionStringOrArrayToT, NameNotEmpty, UInt8> FunctionNotEmpty; typedef FunctionStringOrArrayToT FunctionLength; typedef FunctionStringOrArrayToT FunctionLengthUTF8; -typedef FunctionStringToString, NameLower> FunctionLower; -typedef FunctionStringToString, NameUpper> FunctionUpper; -typedef FunctionStringToString, NameLowerUTF8> FunctionLowerUTF8; -typedef FunctionStringToString, NameUpperUTF8> FunctionUpperUTF8; +typedef FunctionStringToString, NameLower> FunctionLower; +typedef FunctionStringToString, NameUpper> FunctionUpper; +typedef FunctionStringToString< + LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase>, + NameLowerUTF8> FunctionLowerUTF8; +typedef FunctionStringToString< + LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase>, + NameUpperUTF8> FunctionUpperUTF8; typedef FunctionStringToString FunctionReverse; typedef FunctionStringToString FunctionReverseUTF8; typedef FunctionStringNumNumToString FunctionSubstring; typedef FunctionStringNumNumToString FunctionSubstringUTF8; -using FunctionSSELower = FunctionStringToString, NameSSELower>; -using FunctionSSEUpper = FunctionStringToString, NameSSEUpper>; -using FunctionSSELowerUTF8 = FunctionStringToString< - LowerUpperUTF8ImplVectorized<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase>, - NameSSELowerUTF8>; -using FunctionSSEUpperUTF8 = FunctionStringToString< - LowerUpperUTF8ImplVectorized<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase>, - NameSSEUpperUTF8>; - } diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index eadd6bc6884..17cda08cbc3 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -20,10 +20,6 @@ void registerFunctionsString(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); } } diff --git a/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.reference b/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.reference new file mode 100644 index 00000000000..f202cb75513 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.reference @@ -0,0 +1,24 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.sql b/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.sql new file mode 100644 index 00000000000..d3f1c6f6230 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00170_lower_upper_utf8.sql @@ -0,0 +1,29 @@ +select lower('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str; +select lowerUTF8('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str; +select lower('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa'; +select lowerUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa'; + +select upper('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str; +select upperUTF8('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str; +select upper('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA'; +select upperUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA'; + +select sum(lower(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n; +select sum(lowerUTF8(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n; +select sum(lower(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n; +select sum(lowerUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n; + +select sum(upper(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n; +select sum(upperUTF8(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n; +select sum(upper(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n; +select sum(upperUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n; + +select lower('aaaaАБВГAAAAaaAA') = 'aaaaАБВГaaaaaaaa'; +select upper('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA'; +select lowerUTF8('aaaaАБВГAAAAaaAA') = 'aaaaабвгaaaaaaaa'; +select upperUTF8('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA'; + +select sum(lower(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaАБВГaaaaaaaa')) = count() array join range(16384) as n; +select sum(upper(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n; +select sum(lowerUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaабвгaaaaaaaa')) = count() array join range(16384) as n; +select sum(upperUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n;