mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Merge pull request #37777 from ClickHouse/avx512_tail_zero
Use multitarget framework for numZerosInTail implementation
This commit is contained in:
commit
f860407af5
@ -95,12 +95,15 @@ String toString(TargetArch arch);
|
||||
|
||||
#if defined(__clang__)
|
||||
|
||||
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
|
||||
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f")))
|
||||
#define AVX2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2")))
|
||||
#define AVX_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx"))
|
||||
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
|
||||
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
|
||||
|
||||
# define BEGIN_AVX512BW_SPECIFIC_CODE \
|
||||
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw\"))),apply_to=function)")
|
||||
# define BEGIN_AVX512F_SPECIFIC_CODE \
|
||||
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f\"))),apply_to=function)")
|
||||
# define BEGIN_AVX2_SPECIFIC_CODE \
|
||||
@ -118,12 +121,16 @@ String toString(TargetArch arch);
|
||||
# define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition();
|
||||
#else
|
||||
|
||||
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native")))
|
||||
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native")))
|
||||
#define AVX2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,tune=native")))
|
||||
#define AVX_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,tune=native")))
|
||||
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native))))
|
||||
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
|
||||
|
||||
# define BEGIN_AVX512BW_SPECIFIC_CODE \
|
||||
_Pragma("GCC push_options") \
|
||||
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native\")")
|
||||
# define BEGIN_AVX512F_SPECIFIC_CODE \
|
||||
_Pragma("GCC push_options") \
|
||||
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native\")")
|
||||
@ -180,6 +187,15 @@ namespace TargetSpecific::AVX512F { \
|
||||
} \
|
||||
END_TARGET_SPECIFIC_CODE
|
||||
|
||||
#define DECLARE_AVX512BW_SPECIFIC_CODE(...) \
|
||||
BEGIN_AVX512BW_SPECIFIC_CODE \
|
||||
namespace TargetSpecific::AVX512BW { \
|
||||
DUMMY_FUNCTION_DEFINITION \
|
||||
using namespace DB::TargetSpecific::AVX512BW; \
|
||||
__VA_ARGS__ \
|
||||
} \
|
||||
END_TARGET_SPECIFIC_CODE
|
||||
|
||||
#else
|
||||
|
||||
#define USE_MULTITARGET_CODE 0
|
||||
@ -190,6 +206,7 @@ END_TARGET_SPECIFIC_CODE
|
||||
#define DECLARE_AVX_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX2_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX512F_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX512BW_SPECIFIC_CODE(...)
|
||||
|
||||
#endif
|
||||
|
||||
@ -205,7 +222,8 @@ DECLARE_DEFAULT_CODE (__VA_ARGS__) \
|
||||
DECLARE_SSE42_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX2_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__)
|
||||
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
|
||||
DECLARE_AVX512BW_SPECIFIC_CODE(__VA_ARGS__)
|
||||
|
||||
DECLARE_DEFAULT_CODE(
|
||||
constexpr auto BuildArch = TargetArch::Default; /// NOLINT
|
||||
@ -227,6 +245,10 @@ DECLARE_AVX512F_SPECIFIC_CODE(
|
||||
constexpr auto BuildArch = TargetArch::AVX512F; /// NOLINT
|
||||
) // DECLARE_AVX512F_SPECIFIC_CODE
|
||||
|
||||
DECLARE_AVX512BW_SPECIFIC_CODE(
|
||||
constexpr auto BuildArch = TargetArch::AVX512BW; /// NOLINT
|
||||
) // DECLARE_AVX512BW_SPECIFIC_CODE
|
||||
|
||||
/** Runtime Dispatch helpers for class members.
|
||||
*
|
||||
* Example of usage:
|
||||
|
@ -447,37 +447,82 @@ void MergeTreeRangeReader::ReadResult::collapseZeroTails(const IColumn::Filter &
|
||||
new_filter_vec.resize(new_filter_data - new_filter_vec.data());
|
||||
}
|
||||
|
||||
size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, const UInt8 * end)
|
||||
DECLARE_AVX512BW_SPECIFIC_CODE(
|
||||
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
|
||||
{
|
||||
size_t count = 0;
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512BW__) /// check if avx512 instructions are compiled
|
||||
if (isArchSupported(TargetArch::AVX512BW))
|
||||
const __m512i zero64 = _mm512_setzero_epi32();
|
||||
while (end - begin >= 64)
|
||||
{
|
||||
/// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F
|
||||
const __m512i zero64 = _mm512_setzero_epi32();
|
||||
while (end - begin >= 64)
|
||||
end -= 64;
|
||||
const auto * pos = end;
|
||||
UInt64 val = static_cast<UInt64>(_mm512_cmp_epi8_mask(
|
||||
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(pos)),
|
||||
zero64,
|
||||
_MM_CMPINT_EQ));
|
||||
val = ~val;
|
||||
if (val == 0)
|
||||
count += 64;
|
||||
else
|
||||
{
|
||||
end -= 64;
|
||||
const auto * pos = end;
|
||||
UInt64 val = static_cast<UInt64>(_mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(pos)), zero64, _MM_CMPINT_EQ));
|
||||
val = ~val;
|
||||
if (val == 0)
|
||||
count += 64;
|
||||
else
|
||||
{
|
||||
count += __builtin_clzll(val);
|
||||
return count;
|
||||
}
|
||||
count += __builtin_clzll(val);
|
||||
return count;
|
||||
}
|
||||
while (end > begin && *(--end) == 0)
|
||||
{
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
while (end > begin && *(--end) == 0)
|
||||
{
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
) /// DECLARE_AVX512BW_SPECIFIC_CODE
|
||||
|
||||
DECLARE_AVX2_SPECIFIC_CODE(
|
||||
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
|
||||
{
|
||||
size_t count = 0;
|
||||
const __m256i zero32 = _mm256_setzero_si256();
|
||||
while (end - begin >= 64)
|
||||
{
|
||||
end -= 64;
|
||||
const auto * pos = end;
|
||||
UInt64 val =
|
||||
(static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos)),
|
||||
zero32))) & 0xffffffffu)
|
||||
| (static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos + 32)),
|
||||
zero32))) << 32u);
|
||||
|
||||
val = ~val;
|
||||
if (val == 0)
|
||||
count += 64;
|
||||
else
|
||||
{
|
||||
count += __builtin_clzll(val);
|
||||
return count;
|
||||
}
|
||||
}
|
||||
while (end > begin && *(--end) == 0)
|
||||
{
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
) /// DECLARE_AVX2_SPECIFIC_CODE
|
||||
|
||||
size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, const UInt8 * end)
|
||||
{
|
||||
#if USE_MULTITARGET_CODE
|
||||
/// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F
|
||||
if (isArchSupported(TargetArch::AVX512BW))
|
||||
return TargetSpecific::AVX512BW::numZerosInTail(begin, end);
|
||||
else if (isArchSupported(TargetArch::AVX2))
|
||||
return TargetSpecific::AVX2::numZerosInTail(begin, end);
|
||||
#endif
|
||||
|
||||
size_t count = 0;
|
||||
|
||||
#if defined(__SSE2__) && defined(__POPCNT__)
|
||||
const __m128i zero16 = _mm_setzero_si128();
|
||||
while (end - begin >= 64)
|
||||
|
Loading…
Reference in New Issue
Block a user