dbms: replace old lower/upper(UTF8) with vectorized ones [#METR-14764]

This commit is contained in:
Andrey Mironov 2015-06-10 15:47:27 +03:00
parent b2bfa55a37
commit ea4f4420ba
4 changed files with 68 additions and 112 deletions

View File

@ -202,46 +202,11 @@ struct LengthUTF8Impl
};
/** Переводит строку в нижний (верхний) регистр, в текущей локали, в однобайтовой кодировке.
*/
template <int F(int)>
template <char not_case_lower_bound, char not_case_upper_bound>
struct LowerUpperImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
reinterpret_cast<UInt8 *>(&*res_data.begin()));
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
for (; src < src_end; ++src, ++dst)
*dst = F(*src);
}
};
template <char not_case_lower_bound, char not_case_upper_bound>
struct LowerUpperImplVectorized
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
@ -348,9 +313,14 @@ inline void UTF8CyrillicToCase(const UInt8 * & src, const UInt8 * const src_end,
}
};
/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр.
* Замечание: предполагается, что после перевода символа в другой регистр,
* длина его мультибайтовой последовательности в UTF-8 не меняется.
* Иначе - поведение не определено.
*/
template <char not_case_lower_bound, char not_case_upper_bound,
int to_case(int), void cyrillic_to_case(const UInt8 * &, const UInt8 *, UInt8 * &)>
struct LowerUpperUTF8ImplVectorized
struct LowerUpperUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
@ -487,59 +457,6 @@ private:
};
/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр.
* Замечание: предполагается, что после перевода символа в другой регистр,
* длина его мультибайтовой последовательности в UTF-8 не меняется.
* Иначе - поведение не определено.
*/
template <int F(int)>
struct LowerUpperUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
reinterpret_cast<UInt8 *>(&*res_data.begin()));
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
static Poco::UTF8Encoding utf8;
while (src < src_end)
{
int chars = utf8.convert(F(utf8.convert(src)), dst, src_end - src);
if (chars)
{
src += chars;
dst += chars;
}
else
{
++src;
++dst;
}
}
}
};
/** Разворачивает строку в байтах.
*/
struct ReverseImpl
@ -1676,32 +1593,22 @@ struct NameReverseUTF8 { static constexpr auto name = "reverseUTF8"; };
struct NameSubstring { static constexpr auto name = "substring"; };
struct NameSubstringUTF8 { static constexpr auto name = "substringUTF8"; };
struct NameSSELower { static constexpr auto name = "sse_lower"; };
struct NameSSEUpper { static constexpr auto name = "sse_upper"; };
struct NameSSELowerUTF8 { static constexpr auto name = "sse_lowerUTF8"; };
struct NameSSEUpperUTF8 { static constexpr auto name = "sse_upperUTF8"; };
typedef FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8> FunctionEmpty;
typedef FunctionStringOrArrayToT<EmptyImpl<true>, NameNotEmpty, UInt8> FunctionNotEmpty;
typedef FunctionStringOrArrayToT<LengthImpl, NameLength, UInt64> FunctionLength;
typedef FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64> FunctionLengthUTF8;
typedef FunctionStringToString<LowerUpperImpl<tolower>, NameLower> FunctionLower;
typedef FunctionStringToString<LowerUpperImpl<toupper>, NameUpper> FunctionUpper;
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toLower>, NameLowerUTF8> FunctionLowerUTF8;
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toUpper>, NameUpperUTF8> FunctionUpperUTF8;
typedef FunctionStringToString<LowerUpperImpl<'A', 'Z'>, NameLower> FunctionLower;
typedef FunctionStringToString<LowerUpperImpl<'a', 'z'>, NameUpper> FunctionUpper;
typedef FunctionStringToString<
LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>,
NameLowerUTF8> FunctionLowerUTF8;
typedef FunctionStringToString<
LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>,
NameUpperUTF8> FunctionUpperUTF8;
typedef FunctionStringToString<ReverseImpl, NameReverse> FunctionReverse;
typedef FunctionStringToString<ReverseUTF8Impl, NameReverseUTF8> FunctionReverseUTF8;
typedef FunctionStringNumNumToString<SubstringImpl, NameSubstring> FunctionSubstring;
typedef FunctionStringNumNumToString<SubstringUTF8Impl, NameSubstringUTF8> FunctionSubstringUTF8;
using FunctionSSELower = FunctionStringToString<LowerUpperImplVectorized<'A', 'Z'>, NameSSELower>;
using FunctionSSEUpper = FunctionStringToString<LowerUpperImplVectorized<'a', 'z'>, NameSSEUpper>;
using FunctionSSELowerUTF8 = FunctionStringToString<
LowerUpperUTF8ImplVectorized<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>,
NameSSELowerUTF8>;
using FunctionSSEUpperUTF8 = FunctionStringToString<
LowerUpperUTF8ImplVectorized<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>,
NameSSEUpperUTF8>;
}

View File

@ -20,10 +20,6 @@ void registerFunctionsString(FunctionFactory & factory)
factory.registerFunction<FunctionSubstring>();
factory.registerFunction<FunctionSubstringUTF8>();
factory.registerFunction<FunctionAppendTrailingCharIfAbsent>();
factory.registerFunction<FunctionSSELower>();
factory.registerFunction<FunctionSSEUpper>();
factory.registerFunction<FunctionSSELowerUTF8>();
factory.registerFunction<FunctionSSEUpperUTF8>();
}
}

View File

@ -0,0 +1,24 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,29 @@
select lower('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
select lowerUTF8('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
select lower('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
select lowerUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
select upper('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str;
select upperUTF8('AAAAAAAAAAAAAAA012345789,.!AAAA' as str) = str;
select upper('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA';
select upperUTF8('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'AAAAAAAAAAAAAAA012345789,.!AAAA';
select sum(lower(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n;
select sum(lowerUTF8(materialize('aaaaaaaaaaaaaaa012345789,.!aaaa') as str) = str) = count() array join range(16384) as n;
select sum(lower(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n;
select sum(lowerUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('aaaaaaaaaaaaaaa012345789,.!aaaa')) = count() array join range(16384) as n;
select sum(upper(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n;
select sum(upperUTF8(materialize('AAAAAAAAAAAAAAA012345789,.!AAAA') as str) = str) = count() array join range(16384) as n;
select sum(upper(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n;
select sum(upperUTF8(materialize('AaAaAaAaAaAaAaA012345789,.!aAaA')) = materialize('AAAAAAAAAAAAAAA012345789,.!AAAA')) = count() array join range(16384) as n;
select lower('aaaaАБВГAAAAaaAA') = 'aaaaАБВГaaaaaaaa';
select upper('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA';
select lowerUTF8('aaaaАБВГAAAAaaAA') = 'aaaaабвгaaaaaaaa';
select upperUTF8('aaaaАБВГAAAAaaAA') = 'AAAAАБВГAAAAAAAA';
select sum(lower(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaАБВГaaaaaaaa')) = count() array join range(16384) as n;
select sum(upper(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n;
select sum(lowerUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaабвгaaaaaaaa')) = count() array join range(16384) as n;
select sum(upperUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() array join range(16384) as n;