diff --git a/src/Common/StringUtils.cpp b/src/Common/StringUtils.cpp index f61a39851e2..18577e64c01 100644 --- a/src/Common/StringUtils.cpp +++ b/src/Common/StringUtils.cpp @@ -1,4 +1,10 @@ -#include "StringUtils.h" +#include + +#include + +#if USE_MULTITARGET_CODE +#include +#endif namespace impl @@ -15,3 +21,67 @@ bool endsWith(const std::string & s, const char * suffix, size_t suffix_size) } } + +DECLARE_DEFAULT_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + UInt8 mask = 0; + for (size_t i = 0; i < size; ++i) + mask |= data[i]; + + return !(mask & 0x80); +}) + +DECLARE_SSE42_SPECIFIC_CODE( +/// Copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h +bool isAllASCII(const UInt8 * data, size_t size) +{ + __m128i masks = _mm_setzero_si128(); + + size_t i = 0; + for (; i + 16 <= size; i += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(data + i)); + masks = _mm_or_si128(masks, bytes); + } + int mask = _mm_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +DECLARE_AVX2_SPECIFIC_CODE( +bool isAllASCII(const UInt8 * data, size_t size) +{ + __m256i masks = _mm256_setzero_si256(); + + size_t i = 0; + for (; i + 32 <= size; i += 32) + { + __m256i bytes = _mm256_loadu_si256(reinterpret_cast(data + i)); + masks = _mm256_or_si256(masks, bytes); + } + int mask = _mm256_movemask_epi8(masks); + + UInt8 tail_mask = 0; + for (; i < size; i++) + tail_mask |= data[i]; + + mask |= (tail_mask & 0x80); + return !mask; +}) + +bool isAllASCII(const UInt8 * data, size_t size) +{ +#if USE_MULTITARGET_CODE + if (isArchSupported(DB::TargetArch::AVX2)) + return TargetSpecific::AVX2::isAllASCII(data, size); + if (isArchSupported(DB::TargetArch::SSE42)) + return TargetSpecific::SSE42::isAllASCII(data, size); +#endif + return TargetSpecific::Default::isAllASCII(data, size); +} diff --git a/src/Common/StringUtils.h b/src/Common/StringUtils.h index 051e4338714..fe5fc3c058f 100644 --- a/src/Common/StringUtils.h +++ b/src/Common/StringUtils.h @@ -7,6 +7,8 @@ #include #include +#include + namespace impl { @@ -315,6 +317,9 @@ inline void trim(std::string & str, char c = ' ') trimLeft(str, c); } +/// If all characters in the string are ASCII, return true +bool isAllASCII(const UInt8 * data, size_t size); + constexpr bool containsGlobs(const std::string & str) { return str.find_first_of("*?{") != std::string::npos; diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 3c3cf61bbfc..8c8c8e8327b 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -1,14 +1,9 @@ -#include -#include #include +#include #include #include -#if USE_MULTITARGET_CODE -#include -#endif - namespace DB { namespace UTF8 @@ -208,7 +203,6 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l } - size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept { return computeWidthImpl(data, size, prefix, 0); @@ -219,71 +213,5 @@ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, s return computeWidthImpl(data, size, prefix, limit); } - -DECLARE_DEFAULT_CODE( -bool isAllASCII(const UInt8 * data, size_t size) -{ - UInt8 mask = 0; - for (size_t i = 0; i < size; ++i) - mask |= data[i]; - - return !(mask & 0x80); -}) - -DECLARE_SSE42_SPECIFIC_CODE( -/// Copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h -bool isAllASCII(const UInt8 * data, size_t size) -{ - __m128i masks = _mm_setzero_si128(); - - size_t i = 0; - for (; i + 16 <= size; i += 16) - { - __m128i bytes = _mm_loadu_si128(reinterpret_cast(data + i)); - masks = _mm_or_si128(masks, bytes); - } - int mask = _mm_movemask_epi8(masks); - - UInt8 tail_mask = 0; - for (; i < size; i++) - tail_mask |= data[i]; - - mask |= (tail_mask & 0x80); - return !mask; -}) - -DECLARE_AVX2_SPECIFIC_CODE( -bool isAllASCII(const UInt8 * data, size_t size) -{ - __m256i masks = _mm256_setzero_si256(); - - size_t i = 0; - for (; i + 32 <= size; i += 32) - { - __m256i bytes = _mm256_loadu_si256(reinterpret_cast(data + i)); - masks = _mm256_or_si256(masks, bytes); - } - int mask = _mm256_movemask_epi8(masks); - - UInt8 tail_mask = 0; - for (; i < size; i++) - tail_mask |= data[i]; - - mask |= (tail_mask & 0x80); - return !mask; -}) - -bool isAllASCII(const UInt8* data, size_t size) -{ -#if USE_MULTITARGET_CODE - if (isArchSupported(TargetArch::AVX2)) - return TargetSpecific::AVX2::isAllASCII(data, size); - if (isArchSupported(TargetArch::SSE42)) - return TargetSpecific::SSE42::isAllASCII(data, size); -#endif - return TargetSpecific::Default::isAllASCII(data, size); -} - - } } diff --git a/src/Common/UTF8Helpers.h b/src/Common/UTF8Helpers.h index 933b62c7b63..b09d92bd731 100644 --- a/src/Common/UTF8Helpers.h +++ b/src/Common/UTF8Helpers.h @@ -136,10 +136,6 @@ size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept */ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept; - -/// If all the characters in the string are ASCII, return true. -bool isAllASCII(const UInt8* data, size_t size); - } } diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h index bb794a0f8ed..eebba7b9d5f 100644 --- a/src/Functions/LowerUpperUTF8Impl.h +++ b/src/Functions/LowerUpperUTF8Impl.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #ifdef __SSE2__ @@ -94,7 +95,7 @@ struct LowerUpperUTF8Impl if (data.empty()) return; - bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + bool all_ascii = isAllASCII(data.data(), data.size()); if (all_ascii) { LowerUpperImpl::vector(data, offsets, res_data, res_offsets); diff --git a/src/Functions/padString.cpp b/src/Functions/padString.cpp index 0922e0ddb8a..8670c837e21 100644 --- a/src/Functions/padString.cpp +++ b/src/Functions/padString.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -237,8 +238,8 @@ namespace void executeForSource(SourceStrings && strings, const ColumnPtr & column_length, const String & pad_string, StringSink & res_sink) const { const auto & chars = strings.getElements(); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) - && UTF8::isAllASCII(chars.data(), chars.size()); + bool all_ascii = isAllASCII(reinterpret_cast(pad_string.data()), pad_string.size()) + && isAllASCII(chars.data(), chars.size()); bool is_actually_utf8 = is_utf8 && !all_ascii; if (!is_actually_utf8) diff --git a/src/Functions/reverseUTF8.cpp b/src/Functions/reverseUTF8.cpp index 4ea861919a1..1aee349fa8d 100644 --- a/src/Functions/reverseUTF8.cpp +++ b/src/Functions/reverseUTF8.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "reverse.h" @@ -27,7 +28,7 @@ struct ReverseUTF8Impl ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - bool all_ascii = UTF8::isAllASCII(data.data(), data.size()); + bool all_ascii = isAllASCII(data.data(), data.size()); if (all_ascii) { ReverseImpl::vector(data, offsets, res_data, res_offsets); diff --git a/src/Functions/substring.cpp b/src/Functions/substring.cpp index 122f83d758b..f1dea7db018 100644 --- a/src/Functions/substring.cpp +++ b/src/Functions/substring.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -149,7 +150,7 @@ public: { if (const ColumnString * col = checkAndGetColumn(column_string.get())) { - bool all_ascii = UTF8::isAllASCII(col->getChars().data(), col->getChars().size()); + bool all_ascii = isAllASCII(col->getChars().data(), col->getChars().size()); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count); else @@ -159,7 +160,7 @@ public: if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) { StringRef str_ref = col_const->getDataAt(0); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); + bool all_ascii = isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); else diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 74474cb4b23..15a321bd5b0 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -129,8 +130,8 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); @@ -162,8 +163,8 @@ namespace res_data.reserve(str_column->getChars().size() / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size()); @@ -194,8 +195,8 @@ namespace res_data.reserve(str.size() * rows / 2); res_offsets.reserve(rows); - bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str.data()), str.size()) - && UTF8::isAllASCII(reinterpret_cast(delim.data()), delim.size()); + bool all_ascii = isAllASCII(reinterpret_cast(str.data()), str.size()) + && isAllASCII(reinterpret_cast(delim.data()), delim.size()); std::unique_ptr searcher = !is_utf8 || all_ascii ? nullptr : std::make_unique(delim.data(), delim.size());