mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Revert "ColumnVector: optimize filter with AVX512VBMI2 compress store"
This commit is contained in:
parent
5ab1eca788
commit
5524706b78
@ -12,14 +12,12 @@
|
||||
#include <Common/RadixSort.h>
|
||||
#include <Common/SipHash.h>
|
||||
#include <Common/WeakHash.h>
|
||||
#include <Common/TargetSpecific.h>
|
||||
#include <Common/assert_cast.h>
|
||||
#include <base/sort.h>
|
||||
#include <base/unaligned.h>
|
||||
#include <base/bit_cast.h>
|
||||
#include <base/scope_guard.h>
|
||||
|
||||
#include <bit>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
@ -27,10 +25,6 @@
|
||||
# include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if USE_MULTITARGET_CODE
|
||||
# include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if USE_EMBEDDED_COMPILER
|
||||
#include <DataTypes/Native.h>
|
||||
#include <llvm/IR/IRBuilder.h>
|
||||
@ -477,115 +471,6 @@ void ColumnVector<T>::insertRangeFrom(const IColumn & src, size_t start, size_t
|
||||
memcpy(data.data() + old_size, &src_vec.data[start], length * sizeof(data[0]));
|
||||
}
|
||||
|
||||
static inline UInt64 blsr(UInt64 mask)
|
||||
{
|
||||
#ifdef __BMI__
|
||||
return _blsr_u64(mask);
|
||||
#else
|
||||
return mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
|
||||
DECLARE_DEFAULT_CODE(
|
||||
template <typename T, typename Container, size_t SIMD_BYTES>
|
||||
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
|
||||
{
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
|
||||
|
||||
if (0xffffffffffffffff == mask)
|
||||
{
|
||||
res_data.insert(data_pos, data_pos + SIMD_BYTES);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = std::countr_zero(mask);
|
||||
res_data.push_back(data_pos[index]);
|
||||
mask = blsr(mask);
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
|
||||
template <size_t ELEMENT_WIDTH>
|
||||
inline void compressStoreAVX512(const void *src, void *dst, const UInt64 mask)
|
||||
{
|
||||
__m512i vsrc = _mm512_loadu_si512(src);
|
||||
if constexpr (ELEMENT_WIDTH == 1)
|
||||
_mm512_mask_compressstoreu_epi8(dst, static_cast<__mmask64>(mask), vsrc);
|
||||
else if constexpr (ELEMENT_WIDTH == 2)
|
||||
_mm512_mask_compressstoreu_epi16(dst, static_cast<__mmask32>(mask), vsrc);
|
||||
else if constexpr (ELEMENT_WIDTH == 4)
|
||||
_mm512_mask_compressstoreu_epi32(dst, static_cast<__mmask16>(mask), vsrc);
|
||||
else if constexpr (ELEMENT_WIDTH == 8)
|
||||
_mm512_mask_compressstoreu_epi64(dst, static_cast<__mmask8>(mask), vsrc);
|
||||
}
|
||||
|
||||
template <typename T, typename Container, size_t SIMD_BYTES>
|
||||
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
|
||||
{
|
||||
static constexpr size_t VEC_LEN = 64; /// AVX512 vector length - 64 bytes
|
||||
static constexpr size_t ELEMENT_WIDTH = sizeof(T);
|
||||
static constexpr size_t ELEMENTS_PER_VEC = VEC_LEN / ELEMENT_WIDTH;
|
||||
static constexpr UInt64 KMASK = 0xffffffffffffffff >> (64 - ELEMENTS_PER_VEC);
|
||||
|
||||
size_t current_offset = res_data.size();
|
||||
size_t reserve_size = res_data.size();
|
||||
size_t alloc_size = SIMD_BYTES * 2;
|
||||
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
/// to avoid calling resize too frequently, resize to reserve buffer.
|
||||
if (reserve_size - current_offset < SIMD_BYTES)
|
||||
{
|
||||
reserve_size += alloc_size;
|
||||
res_data.resize(reserve_size);
|
||||
alloc_size *= 2;
|
||||
}
|
||||
|
||||
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
|
||||
|
||||
if (0xffffffffffffffff == mask)
|
||||
{
|
||||
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
|
||||
_mm512_storeu_si512(reinterpret_cast<void *>(&res_data[current_offset + i]),
|
||||
_mm512_loadu_si512(reinterpret_cast<const void *>(data_pos + i)));
|
||||
current_offset += SIMD_BYTES;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (mask)
|
||||
{
|
||||
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
|
||||
{
|
||||
compressStoreAVX512<ELEMENT_WIDTH>(reinterpret_cast<const void *>(data_pos + i),
|
||||
reinterpret_cast<void *>(&res_data[current_offset]), mask & KMASK);
|
||||
current_offset += std::popcount(mask & KMASK);
|
||||
/// prepare mask for next iter, if ELEMENTS_PER_VEC = 64, no next iter
|
||||
if (ELEMENTS_PER_VEC < 64)
|
||||
{
|
||||
mask >>= ELEMENTS_PER_VEC;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
/// resize to the real size.
|
||||
res_data.resize(current_offset);
|
||||
}
|
||||
)
|
||||
|
||||
template <typename T>
|
||||
ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const
|
||||
{
|
||||
@ -611,13 +496,31 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
static constexpr size_t SIMD_BYTES = 64;
|
||||
const UInt8 * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
#if USE_MULTITARGET_CODE
|
||||
static constexpr bool VBMI2_CAPABLE = sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8;
|
||||
if (VBMI2_CAPABLE && isArchSupported(TargetArch::AVX512VBMI2))
|
||||
TargetSpecific::AVX512VBMI2::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
|
||||
else
|
||||
#endif
|
||||
TargetSpecific::Default::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
|
||||
|
||||
if (0xffffffffffffffff == mask)
|
||||
{
|
||||
res_data.insert(data_pos, data_pos + SIMD_BYTES);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = std::countr_zero(mask);
|
||||
res_data.push_back(data_pos[index]);
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u64(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
|
||||
while (filt_pos < filt_end)
|
||||
{
|
||||
|
@ -1,91 +0,0 @@
|
||||
#include <typeinfo>
|
||||
#include <vector>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Common/randomSeed.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
|
||||
using namespace DB;
|
||||
|
||||
static pcg64 rng(randomSeed());
|
||||
static constexpr int error_code = 12345;
|
||||
static constexpr size_t TEST_RUNS = 500;
|
||||
static constexpr size_t MAX_ROWS = 10000;
|
||||
static const std::vector<size_t> filter_ratios = {1, 2, 5, 11, 32, 64, 100, 1000};
|
||||
static const size_t K = filter_ratios.size();
|
||||
|
||||
template <typename T>
|
||||
static MutableColumnPtr createColumn(size_t n)
|
||||
{
|
||||
auto column = ColumnVector<T>::create();
|
||||
auto & values = column->getData();
|
||||
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
values.push_back(i);
|
||||
}
|
||||
|
||||
return column;
|
||||
}
|
||||
|
||||
bool checkFilter(const PaddedPODArray<UInt8> &flit, const IColumn & src, const IColumn & dst)
|
||||
{
|
||||
size_t n = flit.size();
|
||||
size_t dst_size = dst.size();
|
||||
size_t j = 0; /// index of dest
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
if (flit[i] != 0)
|
||||
{
|
||||
if ((dst_size <= j) || (src.compareAt(i, j, dst, 0) != 0))
|
||||
return false;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
return dst_size == j; /// filtered size check
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void testFilter()
|
||||
{
|
||||
auto test_case = [&](size_t rows, size_t filter_ratio)
|
||||
{
|
||||
auto vector_column = createColumn<T>(rows);
|
||||
PaddedPODArray<UInt8> flit(rows);
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
flit[i] = rng() % filter_ratio == 0;
|
||||
auto res_column = vector_column->filter(flit, -1);
|
||||
|
||||
if (!checkFilter(flit, *vector_column, *res_column))
|
||||
throw Exception(error_code, "VectorColumn filter failure, type: {}", typeid(T).name());
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
for (size_t i = 0; i < TEST_RUNS; ++i)
|
||||
{
|
||||
size_t rows = rng() % MAX_ROWS + 1;
|
||||
size_t filter_ratio = filter_ratios[rng() % K];
|
||||
|
||||
test_case(rows, filter_ratio);
|
||||
}
|
||||
}
|
||||
catch (const Exception & e)
|
||||
{
|
||||
FAIL() << e.displayText();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST(ColumnVector, Filter)
|
||||
{
|
||||
testFilter<UInt8>();
|
||||
testFilter<Int16>();
|
||||
testFilter<UInt32>();
|
||||
testFilter<Int64>();
|
||||
testFilter<UInt128>();
|
||||
testFilter<Int256>();
|
||||
testFilter<Float32>();
|
||||
testFilter<Float64>();
|
||||
testFilter<UUID>();
|
||||
}
|
@ -82,7 +82,6 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
|
||||
OP(AVX512BW) \
|
||||
OP(AVX512VL) \
|
||||
OP(AVX512VBMI) \
|
||||
OP(AVX512VBMI2) \
|
||||
OP(PREFETCHWT1) \
|
||||
OP(SHA) \
|
||||
OP(ADX) \
|
||||
@ -303,11 +302,6 @@ bool haveAVX512VBMI() noexcept
|
||||
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 1) & 1u);
|
||||
}
|
||||
|
||||
bool haveAVX512VBMI2() noexcept
|
||||
{
|
||||
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 6) & 1u);
|
||||
}
|
||||
|
||||
bool haveRDRAND() noexcept
|
||||
{
|
||||
return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x1).registers.ecx >> 30) & 1u);
|
||||
|
@ -20,8 +20,6 @@ UInt32 getSupportedArchs()
|
||||
result |= static_cast<UInt32>(TargetArch::AVX512BW);
|
||||
if (Cpu::CpuFlagsCache::have_AVX512VBMI)
|
||||
result |= static_cast<UInt32>(TargetArch::AVX512VBMI);
|
||||
if (Cpu::CpuFlagsCache::have_AVX512VBMI2)
|
||||
result |= static_cast<UInt32>(TargetArch::AVX512VBMI2);
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -40,9 +38,8 @@ String toString(TargetArch arch)
|
||||
case TargetArch::AVX: return "avx";
|
||||
case TargetArch::AVX2: return "avx2";
|
||||
case TargetArch::AVX512F: return "avx512f";
|
||||
case TargetArch::AVX512BW: return "avx512bw";
|
||||
case TargetArch::AVX512VBMI: return "avx512vbmi";
|
||||
case TargetArch::AVX512VBMI2: return "avx512vbmi";
|
||||
case TargetArch::AVX512BW: return "avx512bw";
|
||||
case TargetArch::AVX512VBMI: return "avx512vbmi";
|
||||
}
|
||||
|
||||
__builtin_unreachable();
|
||||
|
@ -31,7 +31,7 @@
|
||||
* int funcImpl() {
|
||||
* return 2;
|
||||
* }
|
||||
* ) // DECLARE_AVX2_SPECIFIC_CODE
|
||||
* ) // DECLARE_DEFAULT_CODE
|
||||
*
|
||||
* int func() {
|
||||
* #if USE_MULTITARGET_CODE
|
||||
@ -80,9 +80,8 @@ enum class TargetArch : UInt32
|
||||
AVX = (1 << 1),
|
||||
AVX2 = (1 << 2),
|
||||
AVX512F = (1 << 3),
|
||||
AVX512BW = (1 << 4),
|
||||
AVX512VBMI = (1 << 5),
|
||||
AVX512VBMI2 = (1 << 6),
|
||||
AVX512BW = (1 << 4),
|
||||
AVX512VBMI = (1 << 5),
|
||||
};
|
||||
|
||||
/// Runtime detection.
|
||||
@ -101,7 +100,6 @@ String toString(TargetArch arch);
|
||||
|
||||
#if defined(__clang__)
|
||||
|
||||
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2")))
|
||||
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi")))
|
||||
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
|
||||
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f")))
|
||||
@ -110,8 +108,6 @@ String toString(TargetArch arch);
|
||||
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
|
||||
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
|
||||
|
||||
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
|
||||
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2\"))),apply_to=function)")
|
||||
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
|
||||
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi\"))),apply_to=function)")
|
||||
# define BEGIN_AVX512BW_SPECIFIC_CODE \
|
||||
@ -133,7 +129,6 @@ String toString(TargetArch arch);
|
||||
# define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition();
|
||||
#else
|
||||
|
||||
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native")))
|
||||
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native")))
|
||||
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native")))
|
||||
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native")))
|
||||
@ -142,9 +137,6 @@ String toString(TargetArch arch);
|
||||
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native)))
|
||||
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
|
||||
|
||||
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
|
||||
_Pragma("GCC push_options") \
|
||||
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native\")")
|
||||
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
|
||||
_Pragma("GCC push_options") \
|
||||
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native\")")
|
||||
@ -225,16 +217,6 @@ namespace TargetSpecific::AVX512VBMI { \
|
||||
} \
|
||||
END_TARGET_SPECIFIC_CODE
|
||||
|
||||
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...) \
|
||||
BEGIN_AVX512VBMI2_SPECIFIC_CODE \
|
||||
namespace TargetSpecific::AVX512VBMI2 { \
|
||||
DUMMY_FUNCTION_DEFINITION \
|
||||
using namespace DB::TargetSpecific::AVX512VBMI2; \
|
||||
__VA_ARGS__ \
|
||||
} \
|
||||
END_TARGET_SPECIFIC_CODE
|
||||
|
||||
|
||||
#else
|
||||
|
||||
#define USE_MULTITARGET_CODE 0
|
||||
@ -247,7 +229,6 @@ END_TARGET_SPECIFIC_CODE
|
||||
#define DECLARE_AVX512F_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX512BW_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX512VBMI_SPECIFIC_CODE(...)
|
||||
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...)
|
||||
|
||||
#endif
|
||||
|
||||
@ -264,9 +245,8 @@ DECLARE_SSE42_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX2_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
|
||||
DECLARE_AVX512BW_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX512VBMI_SPECIFIC_CODE (__VA_ARGS__) \
|
||||
DECLARE_AVX512VBMI2_SPECIFIC_CODE (__VA_ARGS__)
|
||||
DECLARE_AVX512BW_SPECIFIC_CODE(__VA_ARGS__) \
|
||||
DECLARE_AVX512VBMI_SPECIFIC_CODE(__VA_ARGS__)
|
||||
|
||||
DECLARE_DEFAULT_CODE(
|
||||
constexpr auto BuildArch = TargetArch::Default; /// NOLINT
|
||||
@ -296,9 +276,6 @@ DECLARE_AVX512VBMI_SPECIFIC_CODE(
|
||||
constexpr auto BuildArch = TargetArch::AVX512VBMI; /// NOLINT
|
||||
) // DECLARE_AVX512VBMI_SPECIFIC_CODE
|
||||
|
||||
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
|
||||
constexpr auto BuildArch = TargetArch::AVX512VBMI2; /// NOLINT
|
||||
) // DECLARE_AVX512VBMI2_SPECIFIC_CODE
|
||||
|
||||
/** Runtime Dispatch helpers for class members.
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user