diff --git a/dbms/src/Functions/trim.cpp b/dbms/src/Functions/trim.cpp index f2e2543cc90..46f69530005 100644 --- a/dbms/src/Functions/trim.cpp +++ b/dbms/src/Functions/trim.cpp @@ -1,10 +1,8 @@ #include #include #include +#include -#ifdef __SSE4_2__ -#include -#endif namespace DB { @@ -60,7 +58,7 @@ public: execute(reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length); res_data.resize(res_data.size() + length + 1); - memcpy(&res_data[res_offset], start, length); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length); res_offset += length + 1; res_data[res_offset - 1] = '\0'; @@ -77,59 +75,27 @@ public: private: static void execute(const UInt8 * data, size_t size, const UInt8 *& res_data, size_t & res_size) { - size_t chars_to_trim_left = 0; - size_t chars_to_trim_right = 0; - char whitespace = ' '; -#ifdef __SSE4_2__ - const auto bytes_sse = sizeof(__m128i); - const auto size_sse = size - (size % bytes_sse); - const auto whitespace_mask = _mm_set1_epi8(whitespace); - constexpr auto base_sse_mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY; - auto mask = bytes_sse; -#endif + const char * char_data = reinterpret_cast(data); + const char * char_end = char_data + size; if constexpr (mode::trim_left) { -#ifdef __SSE4_2__ - /// skip whitespace from left in blocks of up to 16 characters - - /// Avoid gcc bug: _mm_cmpistri: error: the third argument must be an 8-bit immediate - enum { left_sse_mode = base_sse_mode | _SIDD_LEAST_SIGNIFICANT }; - while (mask == bytes_sse && chars_to_trim_left < size_sse) - { - const auto chars = _mm_loadu_si128(reinterpret_cast(data + chars_to_trim_left)); - mask = _mm_cmpistri(whitespace_mask, chars, left_sse_mode); - chars_to_trim_left += mask; - } -#endif - /// skip remaining whitespace from left, character by character - while (chars_to_trim_left < size && data[chars_to_trim_left] == whitespace) - ++chars_to_trim_left; + const char * found = find_first_not_symbols<' '>(char_data, char_end); + size_t num_chars = found - char_data; + char_data += num_chars; } if constexpr (mode::trim_right) { - const auto trim_right_size = size - chars_to_trim_left; -#ifdef __SSE4_2__ - /// try to skip whitespace from right in blocks of up to 16 characters - - /// Avoid gcc bug: _mm_cmpistri: error: the third argument must be an 8-bit immediate - enum { right_sse_mode = base_sse_mode | _SIDD_MOST_SIGNIFICANT }; - const auto trim_right_size_sse = trim_right_size - (trim_right_size % bytes_sse); - while (mask == bytes_sse && chars_to_trim_right < trim_right_size_sse) - { - const auto chars = _mm_loadu_si128(reinterpret_cast(data + size - chars_to_trim_right - bytes_sse)); - mask = _mm_cmpistri(whitespace_mask, chars, right_sse_mode); - chars_to_trim_right += mask; - } -#endif - /// skip remaining whitespace from right, character by character - while (chars_to_trim_right < trim_right_size && data[size - chars_to_trim_right - 1] == whitespace) - ++chars_to_trim_right; + const char * found = find_last_not_symbols_or_null<' '>(char_data, char_end); + if (found) + char_end = found + 1; + else + char_end = char_data; } - res_data = data + chars_to_trim_left; - res_size = size - chars_to_trim_left - chars_to_trim_right; + res_data = reinterpret_cast(char_data); + res_size = char_end - char_data; } }; diff --git a/dbms/tests/queries/0_stateless/00997_trim.reference b/dbms/tests/queries/0_stateless/00997_trim.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/00997_trim.sql b/dbms/tests/queries/0_stateless/00997_trim.sql new file mode 100644 index 00000000000..7519877ec5e --- /dev/null +++ b/dbms/tests/queries/0_stateless/00997_trim.sql @@ -0,0 +1,20 @@ +WITH + '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' AS x, + replaceRegexpAll(x, '.', ' ') AS spaces, + concat(substring(spaces, 1, rand(1) % 62), substring(x, 1, rand(2) % 62), substring(spaces, 1, rand(3) % 62)) AS s, + trimLeft(s) AS sl, + trimRight(s) AS sr, + trimBoth(s) AS t, + replaceRegexpOne(s, '^ +', '') AS slr, + replaceRegexpOne(s, ' +$', '') AS srr, + replaceRegexpOne(s, '^ *(.*?) *$', '\\1') AS tr +SELECT + replaceAll(s, ' ', '_'), + replaceAll(sl, ' ', '_'), + replaceAll(slr, ' ', '_'), + replaceAll(sr, ' ', '_'), + replaceAll(srr, ' ', '_'), + replaceAll(t, ' ', '_'), + replaceAll(tr, ' ', '_') +FROM numbers(100000) +WHERE NOT ((sl = slr) AND (sr = srr) AND (t = tr)) diff --git a/libs/libcommon/include/common/find_symbols.h b/libs/libcommon/include/common/find_symbols.h index 68b49397683..162c73251fa 100644 --- a/libs/libcommon/include/common/find_symbols.h +++ b/libs/libcommon/include/common/find_symbols.h @@ -17,7 +17,7 @@ * but with the following differencies: * - works with any memory ranges, including containing zero bytes; * - doesn't require terminating zero byte: end of memory range is passed explicitly; - * - if not found, returns pointer to end instead of NULL; + * - if not found, returns pointer to end instead of nullptr; * - maximum number of symbols to search is 16. * * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, @@ -65,115 +65,154 @@ inline __m128i mm_is_in(__m128i bytes) } #endif - -template -inline const char * find_first_symbols_sse2(const char * begin, const char * end) +template +bool maybe_negate(bool x) { + if constexpr (positive) + return x; + else + return !x; +} + +template +uint16_t maybe_negate(uint16_t x) +{ + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode +{ + End, + Nullptr, +}; + + +template +inline const char * find_first_symbols_sse2(const char * const begin, const char * const end) +{ + const char * pos = begin; + #if defined(__SSE2__) - for (; begin + 15 < end; begin += 16) + for (; pos + 15 < end; pos += 16) { - __m128i bytes = _mm_loadu_si128(reinterpret_cast(begin)); + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); __m128i eq = mm_is_in(bytes); - uint16_t bit_mask = _mm_movemask_epi8(eq); + uint16_t bit_mask = maybe_negate(uint16_t(_mm_movemask_epi8(eq))); if (bit_mask) - return begin + __builtin_ctz(bit_mask); + return pos + __builtin_ctz(bit_mask); } #endif - for (; begin < end; ++begin) - if (is_in(*begin)) - return begin; - return end; + for (; pos < end; ++pos) + if (maybe_negate(is_in(*pos))) + return pos; + + return return_mode == ReturnMode::End ? end : nullptr; } -template -inline const char * find_last_symbols_or_null_sse2(const char * begin, const char * end) +template +inline const char * find_last_symbols_sse2(const char * const begin, const char * const end) { + const char * pos = end; + #if defined(__SSE2__) - for (; end - 16 >= begin; end -= 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. + for (; pos - 16 >= begin; pos -= 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. { - __m128i bytes = _mm_loadu_si128(reinterpret_cast(end - 16)); + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos - 16)); __m128i eq = mm_is_in(bytes); - uint16_t bit_mask = _mm_movemask_epi8(eq); + uint16_t bit_mask = maybe_negate(uint16_t(_mm_movemask_epi8(eq))); if (bit_mask) - return end - 1 - (__builtin_clz(bit_mask) - 16); /// because __builtin_clz works with mask as uint32. + return pos - 1 - (__builtin_clz(bit_mask) - 16); /// because __builtin_clz works with mask as uint32. } #endif - --end; - for (; end >= begin; --end) - if (is_in(*end)) - return end; + --pos; + for (; pos >= begin; --pos) + if (maybe_negate(is_in(*pos))) + return pos; - return nullptr; + return return_mode == ReturnMode::End ? end : nullptr; } -template -inline const char * find_first_symbols_sse42_impl(const char * begin, const char * end) +inline const char * find_first_symbols_sse42_impl(const char * const begin, const char * const end) { + const char * pos = begin; + #if defined(__SSE4_2__) #define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT) __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c16); - for (; begin + 15 < end; begin += 16) + for (; pos + 15 < end; pos += 16) { - __m128i bytes = _mm_loadu_si128(reinterpret_cast(begin)); + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); - if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE)) - return begin + _mm_cmpestri(set, num_chars, bytes, 16, MODE); + if constexpr (positive) + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, MODE); + } + else + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE | _SIDD_NEGATIVE_POLARITY)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, MODE | _SIDD_NEGATIVE_POLARITY); + } } #undef MODE #endif - for (; begin < end; ++begin) - if ( (num_chars >= 1 && *begin == c01) - || (num_chars >= 2 && *begin == c02) - || (num_chars >= 3 && *begin == c03) - || (num_chars >= 4 && *begin == c04) - || (num_chars >= 5 && *begin == c05) - || (num_chars >= 6 && *begin == c06) - || (num_chars >= 7 && *begin == c07) - || (num_chars >= 8 && *begin == c08) - || (num_chars >= 9 && *begin == c09) - || (num_chars >= 10 && *begin == c10) - || (num_chars >= 11 && *begin == c11) - || (num_chars >= 12 && *begin == c12) - || (num_chars >= 13 && *begin == c13) - || (num_chars >= 14 && *begin == c14) - || (num_chars >= 15 && *begin == c15) - || (num_chars >= 16 && *begin == c16)) - return begin; - return end; + for (; pos < end; ++pos) + if ( (num_chars >= 1 && maybe_negate(*pos == c01)) + || (num_chars >= 2 && maybe_negate(*pos == c02)) + || (num_chars >= 3 && maybe_negate(*pos == c03)) + || (num_chars >= 4 && maybe_negate(*pos == c04)) + || (num_chars >= 5 && maybe_negate(*pos == c05)) + || (num_chars >= 6 && maybe_negate(*pos == c06)) + || (num_chars >= 7 && maybe_negate(*pos == c07)) + || (num_chars >= 8 && maybe_negate(*pos == c08)) + || (num_chars >= 9 && maybe_negate(*pos == c09)) + || (num_chars >= 10 && maybe_negate(*pos == c10)) + || (num_chars >= 11 && maybe_negate(*pos == c11)) + || (num_chars >= 12 && maybe_negate(*pos == c12)) + || (num_chars >= 13 && maybe_negate(*pos == c13)) + || (num_chars >= 14 && maybe_negate(*pos == c14)) + || (num_chars >= 15 && maybe_negate(*pos == c15)) + || (num_chars >= 16 && maybe_negate(*pos == c16))) + return pos; + return return_mode == ReturnMode::End ? end : nullptr; } -template +template inline const char * find_first_symbols_sse42(const char * begin, const char * end) { - return find_first_symbols_sse42_impl(begin, end); + return find_first_symbols_sse42_impl(begin, end); } /// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. -template +template inline const char * find_first_symbols_dispatch(const char * begin, const char * end) { #if defined(__SSE4_2__) if (sizeof...(symbols) >= 5) - return find_first_symbols_sse42(begin, end); + return find_first_symbols_sse42(begin, end); else #endif - return find_first_symbols_sse2(begin, end); + return find_first_symbols_sse2(begin, end); } } @@ -182,7 +221,7 @@ inline const char * find_first_symbols_dispatch(const char * begin, const char * template inline const char * find_first_symbols(const char * begin, const char * end) { - return detail::find_first_symbols_dispatch(begin, end); + return detail::find_first_symbols_dispatch(begin, end); } /// Returning non const result for non const arguments. @@ -190,18 +229,66 @@ inline const char * find_first_symbols(const char * begin, const char * end) template inline char * find_first_symbols(char * begin, char * end) { - return const_cast(detail::find_first_symbols_dispatch(begin, end)); + return const_cast(detail::find_first_symbols_dispatch(begin, end)); +} + +template +inline const char * find_first_not_symbols(const char * begin, const char * end) +{ + return detail::find_first_symbols_dispatch(begin, end); +} + +template +inline char * find_first_not_symbols(char * begin, char * end) +{ + return const_cast(detail::find_first_symbols_dispatch(begin, end)); +} + +template +inline const char * find_first_symbols_or_null(const char * begin, const char * end) +{ + return detail::find_first_symbols_dispatch(begin, end); +} + +template +inline char * find_first_symbols_or_null(char * begin, char * end) +{ + return const_cast(detail::find_first_symbols_dispatch(begin, end)); +} + +template +inline const char * find_first_not_symbols_or_null(const char * begin, const char * end) +{ + return detail::find_first_symbols_dispatch(begin, end); +} + +template +inline char * find_first_not_symbols_or_null(char * begin, char * end) +{ + return const_cast(detail::find_first_symbols_dispatch(begin, end)); } template inline const char * find_last_symbols_or_null(const char * begin, const char * end) { - return detail::find_last_symbols_or_null_sse2(begin, end); + return detail::find_last_symbols_sse2(begin, end); } template inline char * find_last_symbols_or_null(char * begin, char * end) { - return const_cast(detail::find_last_symbols_or_null_sse2(begin, end)); + return const_cast(detail::find_last_symbols_sse2(begin, end)); +} + +template +inline const char * find_last_not_symbols_or_null(const char * begin, const char * end) +{ + return detail::find_last_symbols_sse2(begin, end); +} + +template +inline char * find_last_not_symbols_or_null(char * begin, char * end) +{ + return const_cast(detail::find_last_symbols_sse2(begin, end)); }