Merge pull request #41752 from ClickHouse/revert-revert-revert

Revert of "Revert the revert of "ColumnVector: optimize filter with AVX512 VBMI2 compress store" #40033"
This commit is contained in:
Alexey Milovidov 2022-09-26 15:16:02 +03:00 committed by GitHub
commit 7bb245720a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 32 additions and 332 deletions

View File

@ -12,14 +12,12 @@
#include <Common/RadixSort.h>
#include <Common/SipHash.h>
#include <Common/WeakHash.h>
#include <Common/TargetSpecific.h>
#include <Common/assert_cast.h>
#include <base/sort.h>
#include <base/unaligned.h>
#include <base/bit_cast.h>
#include <base/scope_guard.h>
#include <bit>
#include <cmath>
#include <cstring>
@ -27,10 +25,6 @@
# include <emmintrin.h>
#endif
#if USE_MULTITARGET_CODE
# include <immintrin.h>
#endif
#if USE_EMBEDDED_COMPILER
#include <DataTypes/Native.h>
#include <llvm/IR/IRBuilder.h>
@ -477,128 +471,6 @@ void ColumnVector<T>::insertRangeFrom(const IColumn & src, size_t start, size_t
memcpy(data.data() + old_size, &src_vec.data[start], length * sizeof(data[0]));
}
static inline UInt64 blsr(UInt64 mask)
{
#ifdef __BMI__
return _blsr_u64(mask);
#else
return mask & (mask-1);
#endif
}
DECLARE_DEFAULT_CODE(
template <typename T, typename Container, size_t SIMD_BYTES>
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
{
while (filt_pos < filt_end_aligned)
{
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = std::countr_zero(mask);
res_data.push_back(data_pos[index]);
mask = blsr(mask);
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
}
)
namespace
{
template <typename T, typename Container>
void resize(Container & res_data, size_t reserve_size)
{
#if defined(MEMORY_SANITIZER)
res_data.resize_fill(reserve_size, static_cast<T>(0)); // MSan doesn't recognize that all allocated memory is written by AVX-512 intrinsics.
#else
res_data.resize(reserve_size);
#endif
}
}
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
template <size_t ELEMENT_WIDTH>
inline void compressStoreAVX512(const void *src, void *dst, const UInt64 mask)
{
__m512i vsrc = _mm512_loadu_si512(src);
if constexpr (ELEMENT_WIDTH == 1)
_mm512_mask_compressstoreu_epi8(dst, static_cast<__mmask64>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 2)
_mm512_mask_compressstoreu_epi16(dst, static_cast<__mmask32>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 4)
_mm512_mask_compressstoreu_epi32(dst, static_cast<__mmask16>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 8)
_mm512_mask_compressstoreu_epi64(dst, static_cast<__mmask8>(mask), vsrc);
}
template <typename T, typename Container, size_t SIMD_BYTES>
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
{
static constexpr size_t VEC_LEN = 64; /// AVX512 vector length - 64 bytes
static constexpr size_t ELEMENT_WIDTH = sizeof(T);
static constexpr size_t ELEMENTS_PER_VEC = VEC_LEN / ELEMENT_WIDTH;
static constexpr UInt64 KMASK = 0xffffffffffffffff >> (64 - ELEMENTS_PER_VEC);
size_t current_offset = res_data.size();
size_t reserve_size = res_data.size();
size_t alloc_size = SIMD_BYTES * 2;
while (filt_pos < filt_end_aligned)
{
/// to avoid calling resize too frequently, resize to reserve buffer.
if (reserve_size - current_offset < SIMD_BYTES)
{
reserve_size += alloc_size;
resize<T>(res_data, reserve_size);
alloc_size *= 2;
}
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
_mm512_storeu_si512(reinterpret_cast<void *>(&res_data[current_offset + i]),
_mm512_loadu_si512(reinterpret_cast<const void *>(data_pos + i)));
current_offset += SIMD_BYTES;
}
else
{
if (mask)
{
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
{
compressStoreAVX512<ELEMENT_WIDTH>(reinterpret_cast<const void *>(data_pos + i),
reinterpret_cast<void *>(&res_data[current_offset]), mask & KMASK);
current_offset += std::popcount(mask & KMASK);
/// prepare mask for next iter, if ELEMENTS_PER_VEC = 64, no next iter
if (ELEMENTS_PER_VEC < 64)
{
mask >>= ELEMENTS_PER_VEC;
}
}
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
/// resize to the real size.
res_data.resize(current_offset);
}
)
template <typename T>
ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const
{
@ -624,13 +496,31 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
static constexpr size_t SIMD_BYTES = 64;
const UInt8 * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
#if USE_MULTITARGET_CODE
static constexpr bool VBMI2_CAPABLE = sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8;
if (VBMI2_CAPABLE && isArchSupported(TargetArch::AVX512VBMI2))
TargetSpecific::AVX512VBMI2::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
else
#endif
TargetSpecific::Default::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
while (filt_pos < filt_end_aligned)
{
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = std::countr_zero(mask);
res_data.push_back(data_pos[index]);
#ifdef __BMI__
mask = _blsr_u64(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
while (filt_pos < filt_end)
{

View File

@ -1,158 +0,0 @@
#include <limits>
#include <typeinfo>
#include <vector>
#include <Columns/ColumnsNumber.h>
#include <Common/randomSeed.h>
#include <gtest/gtest.h>
using namespace DB;
static pcg64 rng(randomSeed());
static constexpr int error_code = 12345;
static constexpr size_t TEST_RUNS = 500;
static constexpr size_t MAX_ROWS = 10000;
static const std::vector<size_t> filter_ratios = {1, 2, 5, 11, 32, 64, 100, 1000};
static const size_t K = filter_ratios.size();
template <typename T>
static MutableColumnPtr createColumn(size_t n)
{
auto column = ColumnVector<T>::create();
auto & values = column->getData();
for (size_t i = 0; i < n; ++i)
{
values.push_back(i);
}
return column;
}
bool checkFilter(const PaddedPODArray<UInt8> &flit, const IColumn & src, const IColumn & dst)
{
size_t n = flit.size();
size_t dst_size = dst.size();
size_t j = 0; /// index of dest
for (size_t i = 0; i < n; ++i)
{
if (flit[i] != 0)
{
if ((dst_size <= j) || (src.compareAt(i, j, dst, 0) != 0))
return false;
j++;
}
}
return dst_size == j; /// filtered size check
}
template <typename T>
static void testFilter()
{
auto test_case = [&](size_t rows, size_t filter_ratio)
{
auto vector_column = createColumn<T>(rows);
PaddedPODArray<UInt8> flit(rows);
for (size_t i = 0; i < rows; ++i)
flit[i] = rng() % filter_ratio == 0;
auto res_column = vector_column->filter(flit, -1);
if (!checkFilter(flit, *vector_column, *res_column))
throw Exception(error_code, "VectorColumn filter failure, type: {}", typeid(T).name());
};
try
{
for (size_t i = 0; i < TEST_RUNS; ++i)
{
size_t rows = rng() % MAX_ROWS + 1;
size_t filter_ratio = filter_ratios[rng() % K];
test_case(rows, filter_ratio);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnVector, Filter)
{
testFilter<UInt8>();
testFilter<Int16>();
testFilter<UInt32>();
testFilter<Int64>();
testFilter<UInt128>();
testFilter<Int256>();
testFilter<Float32>();
testFilter<Float64>();
testFilter<UUID>();
}
template <typename T>
static MutableColumnPtr createIndexColumn(size_t limit, size_t rows)
{
auto column = ColumnVector<T>::create();
auto & values = column->getData();
auto max = std::numeric_limits<T>::max();
limit = limit > max ? max : limit;
for (size_t i = 0; i < rows; ++i)
{
T val = rng() % limit;
values.push_back(val);
}
return column;
}
template <typename T, typename IndexType>
static void testIndex()
{
static const std::vector<size_t> column_sizes = {64, 128, 196, 256, 512};
auto test_case = [&](size_t rows, size_t index_rows, size_t limit)
{
auto vector_column = createColumn<T>(rows);
auto index_column = createIndexColumn<IndexType>(rows, index_rows);
auto res_column = vector_column->index(*index_column, limit);
if (limit == 0)
limit = index_column->size();
/// check results
if (limit != res_column->size())
throw Exception(error_code, "ColumnVector index size not match to limit: {} {}", typeid(T).name(), typeid(IndexType).name());
for (size_t i = 0; i < limit; ++i)
{
/// vector_column data is the same as index, so indexed column's value will equals to index_column.
if (res_column->get64(i) != index_column->get64(i))
throw Exception(error_code, "ColumnVector index fail: {} {}", typeid(T).name(), typeid(IndexType).name());
}
};
try
{
for (size_t i = 0; i < TEST_RUNS; ++i)
{
/// make sure rows distribute in (column_sizes[r-1], colulmn_sizes[r]]
size_t row_idx = rng() % column_sizes.size();
size_t row_base = row_idx > 0 ? column_sizes[row_idx - 1] : 0;
size_t rows = row_base + (rng() % (column_sizes[row_idx] - row_base) + 1);
size_t index_rows = rng() % MAX_ROWS + 1;
test_case(rows, index_rows, 0);
test_case(rows, index_rows, static_cast<size_t>(0.5 * index_rows));
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnVector, Index)
{
testIndex<UInt8, UInt8>();
testIndex<UInt16, UInt8>();
testIndex<UInt16, UInt16>();
}

View File

@ -82,7 +82,6 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
OP(AVX512BW) \
OP(AVX512VL) \
OP(AVX512VBMI) \
OP(AVX512VBMI2) \
OP(PREFETCHWT1) \
OP(SHA) \
OP(ADX) \
@ -303,11 +302,6 @@ bool haveAVX512VBMI() noexcept
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 1) & 1u);
}
bool haveAVX512VBMI2() noexcept
{
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 6) & 1u);
}
bool haveRDRAND() noexcept
{
return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x1).registers.ecx >> 30) & 1u);

View File

@ -20,8 +20,6 @@ UInt32 getSupportedArchs()
result |= static_cast<UInt32>(TargetArch::AVX512BW);
if (Cpu::CpuFlagsCache::have_AVX512VBMI)
result |= static_cast<UInt32>(TargetArch::AVX512VBMI);
if (Cpu::CpuFlagsCache::have_AVX512VBMI2)
result |= static_cast<UInt32>(TargetArch::AVX512VBMI2);
return result;
}
@ -40,9 +38,8 @@ String toString(TargetArch arch)
case TargetArch::AVX: return "avx";
case TargetArch::AVX2: return "avx2";
case TargetArch::AVX512F: return "avx512f";
case TargetArch::AVX512BW: return "avx512bw";
case TargetArch::AVX512VBMI: return "avx512vbmi";
case TargetArch::AVX512VBMI2: return "avx512vbmi";
case TargetArch::AVX512BW: return "avx512bw";
case TargetArch::AVX512VBMI: return "avx512vbmi";
}
__builtin_unreachable();

View File

@ -31,7 +31,7 @@
* int funcImpl() {
* return 2;
* }
* ) // DECLARE_AVX2_SPECIFIC_CODE
* ) // DECLARE_DEFAULT_CODE
*
* int func() {
* #if USE_MULTITARGET_CODE
@ -80,9 +80,8 @@ enum class TargetArch : UInt32
AVX = (1 << 1),
AVX2 = (1 << 2),
AVX512F = (1 << 3),
AVX512BW = (1 << 4),
AVX512VBMI = (1 << 5),
AVX512VBMI2 = (1 << 6),
AVX512BW = (1 << 4),
AVX512VBMI = (1 << 5),
};
/// Runtime detection.
@ -101,7 +100,6 @@ String toString(TargetArch arch);
#if defined(__clang__)
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2")))
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi")))
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f")))
@ -110,8 +108,6 @@ String toString(TargetArch arch);
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2\"))),apply_to=function)")
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi\"))),apply_to=function)")
# define BEGIN_AVX512BW_SPECIFIC_CODE \
@ -133,7 +129,6 @@ String toString(TargetArch arch);
# define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition();
#else
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native")))
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native")))
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native")))
@ -142,9 +137,6 @@ String toString(TargetArch arch);
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native)))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native\")")
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native\")")
@ -225,16 +217,6 @@ namespace TargetSpecific::AVX512VBMI { \
} \
END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...) \
BEGIN_AVX512VBMI2_SPECIFIC_CODE \
namespace TargetSpecific::AVX512VBMI2 { \
DUMMY_FUNCTION_DEFINITION \
using namespace DB::TargetSpecific::AVX512VBMI2; \
__VA_ARGS__ \
} \
END_TARGET_SPECIFIC_CODE
#else
#define USE_MULTITARGET_CODE 0
@ -247,7 +229,6 @@ END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX512F_SPECIFIC_CODE(...)
#define DECLARE_AVX512BW_SPECIFIC_CODE(...)
#define DECLARE_AVX512VBMI_SPECIFIC_CODE(...)
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...)
#endif
@ -264,9 +245,8 @@ DECLARE_SSE42_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX2_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
DECLARE_AVX512BW_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512VBMI_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512VBMI2_SPECIFIC_CODE (__VA_ARGS__)
DECLARE_AVX512BW_SPECIFIC_CODE(__VA_ARGS__) \
DECLARE_AVX512VBMI_SPECIFIC_CODE(__VA_ARGS__)
DECLARE_DEFAULT_CODE(
constexpr auto BuildArch = TargetArch::Default; /// NOLINT
@ -296,9 +276,6 @@ DECLARE_AVX512VBMI_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512VBMI; /// NOLINT
) // DECLARE_AVX512VBMI_SPECIFIC_CODE
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512VBMI2; /// NOLINT
) // DECLARE_AVX512VBMI2_SPECIFIC_CODE
/** Runtime Dispatch helpers for class members.
*