diff --git a/src/Functions/LowerUpperImpl.h b/src/Functions/LowerUpperImpl.h
index a7c38a7f904..ca3b7173032 100644
--- a/src/Functions/LowerUpperImpl.h
+++ b/src/Functions/LowerUpperImpl.h
@@ -29,31 +29,61 @@ private:
     {
         const auto flip_case_mask = 'A' ^ 'a';
 
+#if defined(__AVX512F__) && defined(__AVX512BW__) /// check whether AVX512 instructions were compiled in
+        if (isArchSupported(TargetArch::AVX512BW))
+        {
+            /// check at runtime whether the CPU supports AVX512; haveAVX512BW includes the haveAVX512F check
+            const auto byte_avx512 = sizeof(__m512i);
+            const auto src_end_avx = src_end - (src_end - src) % byte_avx512;
+            if (src < src_end_avx)
+            {
+                const auto v_not_case_lower_bound = _mm512_set1_epi8(not_case_lower_bound - 1);
+                const auto v_not_case_upper_bound = _mm512_set1_epi8(not_case_upper_bound + 1);
+                const auto v_flip_case_mask = _mm512_set1_epi8(flip_case_mask);
+
+                for (; src < src_end_avx; src += byte_avx512, dst += byte_avx512)
+                {
+                    const auto chars = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+
+                    const auto is_not_case
+                        = _mm512_and_si512(_mm512_movm_epi8(_mm512_cmpgt_epi8_mask(chars, v_not_case_lower_bound)),
+                                           _mm512_movm_epi8(_mm512_cmplt_epi8_mask(chars, v_not_case_upper_bound)));
+
+                    const auto xor_mask = _mm512_and_si512(v_flip_case_mask, is_not_case);
+
+                    const auto cased_chars = _mm512_xor_si512(chars, xor_mask);
+
+                    _mm512_storeu_si512(reinterpret_cast<__m512i *>(dst), cased_chars);
+                }
+            }
+        }
+#endif
+
 #ifdef __SSE2__
         const auto bytes_sse = sizeof(__m128i);
         const auto * src_end_sse = src_end - (src_end - src) % bytes_sse;
-
-        const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
-        const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
-        const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
-
-        for (; src < src_end_sse; src += bytes_sse, dst += bytes_sse)
+        if (src < src_end_sse)
         {
-            /// load 16 sequential 8-bit characters
-            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+            const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
+            const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
+            const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
 
-            /// find which 8-bit sequences belong to range [case_lower_bound, case_upper_bound]
-            const auto is_not_case
-                = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
+            for (; src < src_end_sse; src += bytes_sse, dst += bytes_sse)
+            {
+                /// load 16 sequential 8-bit characters
+                const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
 
-            /// keep `flip_case_mask` only where necessary, zero out elsewhere
-            const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);
+                /// find which 8-bit sequences belong to range [case_lower_bound, case_upper_bound]
+                const auto is_not_case
+                    = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
 
-            /// flip case by applying calculated mask
-            const auto cased_chars = _mm_xor_si128(chars, xor_mask);
+                /// keep `flip_case_mask` only where necessary, zero out elsewhere
+                const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);
 
-            /// store result back to destination
-            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
+                /// flip case by applying calculated mask
+                const auto cased_chars = _mm_xor_si128(chars, xor_mask);
+
+                /// store result back to destination
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
+            }
         }
 #endif
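
Both vector paths in this patch implement the same idea: bytes inside [not_case_lower_bound, not_case_upper_bound] get XORed with `'A' ^ 'a'` (0x20, the only bit that differs between ASCII cases), everything else passes through unchanged. The new `if (src < src_end_avx)` / `if (src < src_end_sse)` guards only skip materialising the vector constants when there is less than one full register of input, and the AVX512 loop consumes 64-byte blocks before the SSE2 and scalar code handle the remainder. For reference, below is a minimal scalar sketch of that logic; `arrayScalar`, the `main` driver, and the `'A'`/`'Z'` bound values are illustrative assumptions, not part of the patch.

```cpp
#include <cstdint>
#include <cstdio>

/// Scalar equivalent of the vectorised loops in the patch. The template bounds
/// play the role of not_case_lower_bound / not_case_upper_bound in LowerUpperImpl;
/// 'A'/'Z' (used below) is an assumed instantiation for lowercasing ASCII.
template <char not_case_lower_bound, char not_case_upper_bound>
void arrayScalar(const uint8_t * src, const uint8_t * src_end, uint8_t * dst)
{
    const uint8_t flip_case_mask = 'A' ^ 'a'; /// 0x20, the single bit that differs between the cases

    for (; src < src_end; ++src, ++dst)
    {
        /// Scalar analogue of the SIMD range test: check whether the byte lies in
        /// [not_case_lower_bound, not_case_upper_bound] and build the XOR mask from that.
        const bool in_range = *src >= static_cast<uint8_t>(not_case_lower_bound)
            && *src <= static_cast<uint8_t>(not_case_upper_bound);
        const uint8_t xor_mask = in_range ? flip_case_mask : 0;

        /// Flip the case bit only where the mask is set (a no-op for bytes outside the range).
        *dst = *src ^ xor_mask;
    }
}

int main()
{
    const char in[] = "Hello, WORLD!";
    char out[sizeof(in)] = {};

    arrayScalar<'A', 'Z'>(reinterpret_cast<const uint8_t *>(in),
                          reinterpret_cast<const uint8_t *>(in) + sizeof(in) - 1,
                          reinterpret_cast<uint8_t *>(out));

    std::puts(out); /// prints "hello, world!"
}
```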