Merge pull request #37777 from ClickHouse/avx512_tail_zero

Use multitarget framework for numZerosInTail implementation
This commit is contained in:
Alexey Milovidov 2022-06-04 01:46:56 +03:00 committed by GitHub
commit f860407af5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 91 additions and 24 deletions

View File

@ -95,12 +95,15 @@ String toString(TargetArch arch);
#if defined(__clang__)
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f")))
#define AVX2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2")))
#define AVX_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx"))
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512BW_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw\"))),apply_to=function)")
# define BEGIN_AVX512F_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f\"))),apply_to=function)")
# define BEGIN_AVX2_SPECIFIC_CODE \
@ -118,12 +121,16 @@ String toString(TargetArch arch);
# define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition();
#else
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native")))
#define AVX2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,tune=native")))
#define AVX_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,tune=native")))
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native))))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512BW_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native\")")
# define BEGIN_AVX512F_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native\")")
@ -180,6 +187,15 @@ namespace TargetSpecific::AVX512F { \
} \
END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX512BW_SPECIFIC_CODE(...) \
BEGIN_AVX512BW_SPECIFIC_CODE \
namespace TargetSpecific::AVX512BW { \
DUMMY_FUNCTION_DEFINITION \
using namespace DB::TargetSpecific::AVX512BW; \
__VA_ARGS__ \
} \
END_TARGET_SPECIFIC_CODE
#else
#define USE_MULTITARGET_CODE 0
@ -190,6 +206,7 @@ END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX_SPECIFIC_CODE(...)
#define DECLARE_AVX2_SPECIFIC_CODE(...)
#define DECLARE_AVX512F_SPECIFIC_CODE(...)
#define DECLARE_AVX512BW_SPECIFIC_CODE(...)
#endif
@ -205,7 +222,8 @@ DECLARE_DEFAULT_CODE (__VA_ARGS__) \
DECLARE_SSE42_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX2_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__)
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
DECLARE_AVX512BW_SPECIFIC_CODE(__VA_ARGS__)
DECLARE_DEFAULT_CODE(
constexpr auto BuildArch = TargetArch::Default; /// NOLINT
@ -227,6 +245,10 @@ DECLARE_AVX512F_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512F; /// NOLINT
) // DECLARE_AVX512F_SPECIFIC_CODE
DECLARE_AVX512BW_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512BW; /// NOLINT
) // DECLARE_AVX512BW_SPECIFIC_CODE
/** Runtime Dispatch helpers for class members.
*
* Example of usage:

View File

@ -447,37 +447,82 @@ void MergeTreeRangeReader::ReadResult::collapseZeroTails(const IColumn::Filter &
new_filter_vec.resize(new_filter_data - new_filter_vec.data());
}
size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, const UInt8 * end)
DECLARE_AVX512BW_SPECIFIC_CODE(
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
size_t count = 0;
#if defined(__AVX512F__) && defined(__AVX512BW__) /// check if avx512 instructions are compiled
if (isArchSupported(TargetArch::AVX512BW))
const __m512i zero64 = _mm512_setzero_epi32();
while (end - begin >= 64)
{
/// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F
const __m512i zero64 = _mm512_setzero_epi32();
while (end - begin >= 64)
end -= 64;
const auto * pos = end;
UInt64 val = static_cast<UInt64>(_mm512_cmp_epi8_mask(
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(pos)),
zero64,
_MM_CMPINT_EQ));
val = ~val;
if (val == 0)
count += 64;
else
{
end -= 64;
const auto * pos = end;
UInt64 val = static_cast<UInt64>(_mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(pos)), zero64, _MM_CMPINT_EQ));
val = ~val;
if (val == 0)
count += 64;
else
{
count += __builtin_clzll(val);
return count;
}
count += __builtin_clzll(val);
return count;
}
while (end > begin && *(--end) == 0)
{
++count;
}
return count;
}
while (end > begin && *(--end) == 0)
{
++count;
}
return count;
}
) /// DECLARE_AVX512BW_SPECIFIC_CODE
DECLARE_AVX2_SPECIFIC_CODE(
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
size_t count = 0;
const __m256i zero32 = _mm256_setzero_si256();
while (end - begin >= 64)
{
end -= 64;
const auto * pos = end;
UInt64 val =
(static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos)),
zero32))) & 0xffffffffu)
| (static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos + 32)),
zero32))) << 32u);
val = ~val;
if (val == 0)
count += 64;
else
{
count += __builtin_clzll(val);
return count;
}
}
while (end > begin && *(--end) == 0)
{
++count;
}
return count;
}
) /// DECLARE_AVX2_SPECIFIC_CODE
size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
#if USE_MULTITARGET_CODE
/// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F
if (isArchSupported(TargetArch::AVX512BW))
return TargetSpecific::AVX512BW::numZerosInTail(begin, end);
else if (isArchSupported(TargetArch::AVX2))
return TargetSpecific::AVX2::numZerosInTail(begin, end);
#endif
size_t count = 0;
#if defined(__SSE2__) && defined(__POPCNT__)
const __m128i zero16 = _mm_setzero_si128();
while (end - begin >= 64)