mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 00:22:29 +00:00
dbms: replace old lower/upper(UTF8) with vectorized ones [#METR-14764]
This commit is contained in:
parent
b2bfa55a37
commit
ea4f4420ba
@ -202,43 +202,8 @@ struct LengthUTF8Impl
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** Переводит строку в нижний (верхний) регистр, в текущей локали, в однобайтовой кодировке.
|
|
||||||
*/
|
|
||||||
template <int F(int)>
|
|
||||||
struct LowerUpperImpl
|
|
||||||
{
|
|
||||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
|
||||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
res_offsets.assign(offsets);
|
|
||||||
array(&*data.begin(), &*data.end(), &*res_data.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
|
|
||||||
ColumnString::Chars_t & res_data)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
array(&*data.begin(), &*data.end(), &*res_data.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void constant(const std::string & data, std::string & res_data)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
|
|
||||||
reinterpret_cast<UInt8 *>(&*res_data.begin()));
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
|
|
||||||
{
|
|
||||||
for (; src < src_end; ++src, ++dst)
|
|
||||||
*dst = F(*src);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <char not_case_lower_bound, char not_case_upper_bound>
|
template <char not_case_lower_bound, char not_case_upper_bound>
|
||||||
struct LowerUpperImplVectorized
|
struct LowerUpperImpl
|
||||||
{
|
{
|
||||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
||||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
||||||
@ -348,9 +313,14 @@ inline void UTF8CyrillicToCase(const UInt8 * & src, const UInt8 * const src_end,
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр.
|
||||||
|
* Замечание: предполагается, что после перевода символа в другой регистр,
|
||||||
|
* длина его мультибайтовой последовательности в UTF-8 не меняется.
|
||||||
|
* Иначе - поведение не определено.
|
||||||
|
*/
|
||||||
template <char not_case_lower_bound, char not_case_upper_bound,
|
template <char not_case_lower_bound, char not_case_upper_bound,
|
||||||
int to_case(int), void cyrillic_to_case(const UInt8 * &, const UInt8 *, UInt8 * &)>
|
int to_case(int), void cyrillic_to_case(const UInt8 * &, const UInt8 *, UInt8 * &)>
|
||||||
struct LowerUpperUTF8ImplVectorized
|
struct LowerUpperUTF8Impl
|
||||||
{
|
{
|
||||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
||||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
||||||
@ -487,59 +457,6 @@ private:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр.
|
|
||||||
* Замечание: предполагается, что после перевода символа в другой регистр,
|
|
||||||
* длина его мультибайтовой последовательности в UTF-8 не меняется.
|
|
||||||
* Иначе - поведение не определено.
|
|
||||||
*/
|
|
||||||
template <int F(int)>
|
|
||||||
struct LowerUpperUTF8Impl
|
|
||||||
{
|
|
||||||
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
|
|
||||||
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
res_offsets.assign(offsets);
|
|
||||||
array(&*data.begin(), &*data.end(), &*res_data.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
|
|
||||||
ColumnString::Chars_t & res_data)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
array(&*data.begin(), &*data.end(), &*res_data.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void constant(const std::string & data, std::string & res_data)
|
|
||||||
{
|
|
||||||
res_data.resize(data.size());
|
|
||||||
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
|
|
||||||
reinterpret_cast<UInt8 *>(&*res_data.begin()));
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
|
|
||||||
{
|
|
||||||
static Poco::UTF8Encoding utf8;
|
|
||||||
|
|
||||||
while (src < src_end)
|
|
||||||
{
|
|
||||||
int chars = utf8.convert(F(utf8.convert(src)), dst, src_end - src);
|
|
||||||
if (chars)
|
|
||||||
{
|
|
||||||
src += chars;
|
|
||||||
dst += chars;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++src;
|
|
||||||
++dst;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/** Разворачивает строку в байтах.
|
/** Разворачивает строку в байтах.
|
||||||
*/
|
*/
|
||||||
struct ReverseImpl
|
struct ReverseImpl
|
||||||
@ -1676,32 +1593,22 @@ struct NameReverseUTF8 { static constexpr auto name = "reverseUTF8"; };
|
|||||||
struct NameSubstring { static constexpr auto name = "substring"; };
|
struct NameSubstring { static constexpr auto name = "substring"; };
|
||||||
struct NameSubstringUTF8 { static constexpr auto name = "substringUTF8"; };
|
struct NameSubstringUTF8 { static constexpr auto name = "substringUTF8"; };
|
||||||
|
|
||||||
struct NameSSELower { static constexpr auto name = "sse_lower"; };
|
|
||||||
struct NameSSEUpper { static constexpr auto name = "sse_upper"; };
|
|
||||||
struct NameSSELowerUTF8 { static constexpr auto name = "sse_lowerUTF8"; };
|
|
||||||
struct NameSSEUpperUTF8 { static constexpr auto name = "sse_upperUTF8"; };
|
|
||||||
|
|
||||||
typedef FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8> FunctionEmpty;
|
typedef FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8> FunctionEmpty;
|
||||||
typedef FunctionStringOrArrayToT<EmptyImpl<true>, NameNotEmpty, UInt8> FunctionNotEmpty;
|
typedef FunctionStringOrArrayToT<EmptyImpl<true>, NameNotEmpty, UInt8> FunctionNotEmpty;
|
||||||
typedef FunctionStringOrArrayToT<LengthImpl, NameLength, UInt64> FunctionLength;
|
typedef FunctionStringOrArrayToT<LengthImpl, NameLength, UInt64> FunctionLength;
|
||||||
typedef FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64> FunctionLengthUTF8;
|
typedef FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64> FunctionLengthUTF8;
|
||||||
typedef FunctionStringToString<LowerUpperImpl<tolower>, NameLower> FunctionLower;
|
typedef FunctionStringToString<LowerUpperImpl<'A', 'Z'>, NameLower> FunctionLower;
|
||||||
typedef FunctionStringToString<LowerUpperImpl<toupper>, NameUpper> FunctionUpper;
|
typedef FunctionStringToString<LowerUpperImpl<'a', 'z'>, NameUpper> FunctionUpper;
|
||||||
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toLower>, NameLowerUTF8> FunctionLowerUTF8;
|
typedef FunctionStringToString<
|
||||||
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toUpper>, NameUpperUTF8> FunctionUpperUTF8;
|
LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>,
|
||||||
|
NameLowerUTF8> FunctionLowerUTF8;
|
||||||
|
typedef FunctionStringToString<
|
||||||
|
LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>,
|
||||||
|
NameUpperUTF8> FunctionUpperUTF8;
|
||||||
typedef FunctionStringToString<ReverseImpl, NameReverse> FunctionReverse;
|
typedef FunctionStringToString<ReverseImpl, NameReverse> FunctionReverse;
|
||||||
typedef FunctionStringToString<ReverseUTF8Impl, NameReverseUTF8> FunctionReverseUTF8;
|
typedef FunctionStringToString<ReverseUTF8Impl, NameReverseUTF8> FunctionReverseUTF8;
|
||||||
typedef FunctionStringNumNumToString<SubstringImpl, NameSubstring> FunctionSubstring;
|
typedef FunctionStringNumNumToString<SubstringImpl, NameSubstring> FunctionSubstring;
|
||||||
typedef FunctionStringNumNumToString<SubstringUTF8Impl, NameSubstringUTF8> FunctionSubstringUTF8;
|
typedef FunctionStringNumNumToString<SubstringUTF8Impl, NameSubstringUTF8> FunctionSubstringUTF8;
|
||||||
|
|
||||||
using FunctionSSELower = FunctionStringToString<LowerUpperImplVectorized<'A', 'Z'>, NameSSELower>;
|
|
||||||
using FunctionSSEUpper = FunctionStringToString<LowerUpperImplVectorized<'a', 'z'>, NameSSEUpper>;
|
|
||||||
using FunctionSSELowerUTF8 = FunctionStringToString<
|
|
||||||
LowerUpperUTF8ImplVectorized<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>,
|
|
||||||
NameSSELowerUTF8>;
|
|
||||||
using FunctionSSEUpperUTF8 = FunctionStringToString<
|
|
||||||
LowerUpperUTF8ImplVectorized<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>,
|
|
||||||
NameSSEUpperUTF8>;
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -20,10 +20,6 @@ void registerFunctionsString(FunctionFactory & factory)
|
|||||||
factory.registerFunction<FunctionSubstring>();
|
factory.registerFunction<FunctionSubstring>();
|
||||||
factory.registerFunction<FunctionSubstringUTF8>();
|
factory.registerFunction<FunctionSubstringUTF8>();
|
||||||
factory.registerFunction<FunctionAppendTrailingCharIfAbsent>();
|
factory.registerFunction<FunctionAppendTrailingCharIfAbsent>();
|
||||||
factory.registerFunction<FunctionSSELower>();
|
|
||||||
factory.registerFunction<FunctionSSEUpper>();
|
|
||||||
factory.registerFunction<FunctionSSELowerUTF8>();
|
|
||||||
factory.registerFunction<FunctionSSEUpperUTF8>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
29
dbms/tests/queries/0_stateless/00170_lower_upper_utf8.sql
Normal file
29
dbms/tests/queries/0_stateless/00170_lower_upper_utf8.sql
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
select lower('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
|
||||||
|
select lowerUTF8('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
|
||||||
|
select lower('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
|
||||||
|
select lowerUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
|
||||||
|
|
||||||
|
select upper('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str;
|
||||||
|
select upperUTF8('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str;
|
||||||
|
select upper('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA';
|
||||||
|
select upperUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA';
|
||||||
|
|
||||||
|
select sum(lower(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n;
|
||||||
|
select sum(lowerUTF8(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n;
|
||||||
|
select sum(lower(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n;
|
||||||
|
select sum(lowerUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n;
|
||||||
|
|
||||||
|
select sum(upper(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n;
|
||||||
|
select sum(upperUTF8(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n;
|
||||||
|
select sum(upper(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n;
|
||||||
|
select sum(upperUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n;
|
||||||
|
|
||||||
|
select lower('aaaaАБВГAAAAaaAA') = 'aaaaАБВГaaaaaaaa';
|
||||||
|
select upper('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA';
|
||||||
|
select lowerUTF8('aaaaАБВГAAAAaaAA') = 'aaaaабвгaaaaaaaa';
|
||||||
|
select upperUTF8('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA';
|
||||||
|
|
||||||
|
select sum(lower(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaАБВГaaaaaaaa')) = count() array join range(16384) as n;
|
||||||
|
select sum(upper(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n;
|
||||||
|
select sum(lowerUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaабвгaaaaaaaa')) = count() array join range(16384) as n;
|
||||||
|
select sum(upperUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n;
|
Loading…
Reference in New Issue
Block a user