Add x86 AVX2/AVX512 feature support for the filter implementation

Commit a3d629a5b5 by jasperzhu, 2021-10-11 22:51:13 +05:30
Parent: 0adbcd58c9
4 changed files with 246 additions and 4 deletions
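All three filter paths added below follow the same pattern: load SIMD_BYTES filter bytes, compare them against zero to get a bitmask of passing rows, copy the whole chunk when every bit is set, and otherwise walk the set bits with count-trailing-zeros while clearing the lowest bit (_blsr on BMI hardware, mask & (mask - 1) otherwise). A minimal standalone sketch of that pattern (AVX2 variant, GCC/Clang builtins, hypothetical names, data and filter assumed the same length -- not the committed ClickHouse code):

#include <immintrin.h>
#include <cstdint>
#include <vector>

/// Illustrative only: filter Int32 values by a byte mask, 32 filter bytes per step.
std::vector<int32_t> filter_avx2(const std::vector<int32_t> & data, const std::vector<uint8_t> & filt)
{
    std::vector<int32_t> res;
    static constexpr size_t SIMD_BYTES = 32;
    const __m256i zero32 = _mm256_setzero_si256();

    size_t pos = 0;
    const size_t aligned_end = filt.size() / SIMD_BYTES * SIMD_BYTES;
    while (pos < aligned_end)
    {
        uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt.data() + pos)), zero32));

        if (mask == 0xFFFFFFFF)
        {
            /// All 32 rows pass: copy the chunk in one go.
            res.insert(res.end(), data.begin() + pos, data.begin() + pos + SIMD_BYTES);
        }
        else
        {
            while (mask)
            {
                size_t index = __builtin_ctz(mask); /// position of the next passing row
                res.push_back(data[pos + index]);
                mask &= mask - 1;                   /// clear the lowest set bit (BMI: _blsr_u32)
            }
        }
        pos += SIMD_BYTES;
    }

    /// Scalar tail for the remaining rows.
    for (; pos < filt.size(); ++pos)
        if (filt[pos])
            res.push_back(data[pos]);

    return res;
}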


@@ -18,6 +18,8 @@ option (ENABLE_PCLMULQDQ "Use pclmulqdq instructions on x86_64" 1)
option (ENABLE_POPCNT "Use popcnt instructions on x86_64" 1)
option (ENABLE_AVX "Use AVX instructions on x86_64" 0)
option (ENABLE_AVX2 "Use AVX2 instructions on x86_64" 0)
option (ENABLE_AVX512 "Use AVX512 instructions on x86_64" 1)
option (ENABLE_BMI "Use BMI instructions on x86_64" 1)
option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)
@@ -127,6 +129,36 @@ else ()
if (HAVE_AVX2 AND ENABLE_AVX2)
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()
set (TEST_FLAG "-mavx512f -mavx512bw")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
auto a = _mm512_setzero_epi32();
(void)a;
auto b = _mm512_add_epi16(__m512i(), __m512i());
(void)b;
return 0;
}
" HAVE_AVX512)
if (HAVE_AVX512 AND ENABLE_AVX512)
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()
set (TEST_FLAG "-mbmi")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
auto a = _blsr_u32(0);
(void)a;
return 0;
}
" HAVE_BMI)
if (HAVE_BMI AND ENABLE_BMI)
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()
endif ()
cmake_pop_check_state ()
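These probes matter because the -m flags they add to COMPILER_FLAGS are what predefine the macros the filter code checks: -mavx512f -mavx512bw predefine __AVX512F__ and __AVX512BW__, -mavx2 predefines __AVX2__, and -mbmi predefines __BMI__ (GCC/Clang behaviour). A small standalone sketch, outside the build system, of which branch those guards select:

#include <cstdio>

int main()
{
    /// These macros are predefined by the compiler according to the -m flags
    /// chosen above, so the filter code can select its widest available path.
#if defined(__AVX512F__) && defined(__AVX512BW__)
    std::puts("AVX512 filter path compiled in");
#elif defined(__AVX2__)
    std::puts("AVX2 filter path compiled in");
#elif defined(__SSE2__)
    std::puts("SSE2 filter path compiled in");
#else
    std::puts("scalar filter path compiled in");
#endif
#ifdef __BMI__
    std::puts("_blsr instructions available");
#endif
    return 0;
}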


@@ -230,8 +230,74 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
const UInt8 * filt_pos = filt.data();
const UInt8 * filt_end = filt_pos + col_size;
const UInt8 * data_pos = chars.data();
#if defined(__AVX512F__) && defined(__AVX512BW__)
static constexpr size_t SIMD_BYTES = 64;
const __m512i zero64 = _mm512_setzero_epi32();
const UInt8 * filt_end_avx512 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
const size_t chars_per_simd_elements = SIMD_BYTES * n;
while (filt_pos < filt_end_avx512)
{
uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
if (0xFFFFFFFFFFFFFFFF == mask)
{
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
}
else
{
size_t res_chars_size = res->chars.size();
while (mask)
{
size_t index = __builtin_ctzll(mask);
res->chars.resize(res_chars_size + n);
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
res_chars_size += n;
#ifdef __BMI__
mask = _blsr_u64(mask);
#else
mask = mask & (mask-1);
#endif
}
}
data_pos += chars_per_simd_elements;
filt_pos += SIMD_BYTES;
}
#elif defined(__AVX2__)
static constexpr size_t SIMD_BYTES = 32;
const __m256i zero32 = _mm256_setzero_si256();
const UInt8 * filt_end_avx2 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
const size_t chars_per_simd_elements = SIMD_BYTES * n;
while (filt_pos < filt_end_avx2)
{
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
if (0xFFFFFFFF == mask)
{
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
}
else
{
size_t res_chars_size = res->chars.size();
while (mask)
{
size_t index = __builtin_ctz(mask);
res->chars.resize(res_chars_size + n);
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
res_chars_size += n;
#ifdef __BMI__
mask = _blsr_u32(mask);
#else
mask = mask & (mask-1);
#endif
}
}
data_pos += chars_per_simd_elements;
filt_pos += SIMD_BYTES;
}
#elif defined(__SSE2__)
/** A slightly more optimized version.
* Based on the assumption that often pieces of consecutive values
* completely pass or do not pass the filter.
@@ -267,6 +333,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
data_pos += chars_per_simd_elements;
filt_pos += SIMD_BYTES;
}
#endif
size_t res_chars_size = res->chars.size();
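Semantically, the SIMD branches above do in chunks of SIMD_BYTES rows what this scalar sketch does one row at a time: each passing row contributes the n bytes starting at row * n. (Standalone illustration with hypothetical names; std::vector stands in for the column's chars buffer and for memcpySmallAllowReadWriteOverflow15.)

#include <cstddef>
#include <cstdint>
#include <vector>

/// Scalar reference of the fixed-string filter: rows are n-byte slices of `chars`,
/// and a row is kept when its filter byte is non-zero.
std::vector<uint8_t> filter_fixed_scalar(const std::vector<uint8_t> & chars,
                                         const std::vector<uint8_t> & filt,
                                         size_t n)
{
    std::vector<uint8_t> res;
    for (size_t row = 0; row < filt.size(); ++row)
        if (filt[row])
            res.insert(res.end(), chars.begin() + row * n, chars.begin() + (row + 1) * n);
    return res;
}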


@@ -311,7 +311,67 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
const UInt8 * filt_end = filt_pos + size;
const T * data_pos = data.data();
#if defined(__AVX512F__) && defined(__AVX512BW__)
static constexpr size_t SIMD_BYTES = 64;
const __m512i zero64 = _mm512_setzero_epi32();
const UInt8 * filt_end_avx512 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_avx512)
{
UInt64 mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
if (0xFFFFFFFFFFFFFFFF == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = __builtin_ctzll(mask);
res_data.push_back(data_pos[index]);
#ifdef __BMI__
mask = _blsr_u64(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
#elif defined(__AVX2__)
static constexpr size_t SIMD_BYTES = 32;
const __m256i zero32 = _mm256_setzero_si256();
const UInt8 * filt_end_avx2 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_avx2)
{
UInt32 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
if (0xFFFFFFFF == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = __builtin_ctz(mask);
res_data.push_back(data_pos[index]);
#ifdef __BMI__
mask = _blsr_u32(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
#elif defined(__SSE2__)
/** A slightly more optimized version.
* Based on the assumption that often pieces of consecutive values
* completely pass or do not pass the filter.
@@ -344,6 +404,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
#endif
while (filt_pos < filt_end)


@@ -229,7 +229,89 @@ namespace
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
};
#if defined(__AVX512F__) && defined(__AVX512BW__)
const __m512i zero_vec = _mm512_setzero_epi32();
static constexpr size_t SIMD_BYTES = 64;
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_aligned)
{
uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero_vec, _MM_CMPINT_GT);
if (mask == 0xffffffffffffffff)
{
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;
const auto chunk_offset = first ? 0 : offsets_pos[-1];
const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
/// copy elements for SIMD_BYTES arrays at once
const auto elems_size_old = res_elems.size();
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
}
else
{
while (mask)
{
size_t index = __builtin_ctzll(mask);
copy_array(offsets_pos + index);
#ifdef __BMI__
mask = _blsr_u64(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
offsets_pos += SIMD_BYTES;
}
#elif defined(__AVX2__)
const __m256i zero_vec = _mm256_setzero_si256();
static constexpr size_t SIMD_BYTES = 32;
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_aligned)
{
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero_vec));
if (mask == 0xffffffff)
{
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;
const auto chunk_offset = first ? 0 : offsets_pos[-1];
const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
/// copy elements for SIMD_BYTES arrays at once
const auto elems_size_old = res_elems.size();
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
}
else
{
while (mask)
{
size_t index = __builtin_ctz(mask);
copy_array(offsets_pos + index);
#ifdef __BMI__
mask = _blsr_u32(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
offsets_pos += SIMD_BYTES;
}
#elif defined(__SSE2__)
const __m128i zero_vec = _mm_setzero_si128();
static constexpr size_t SIMD_BYTES = 16;
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
@@ -268,7 +350,7 @@ namespace
filt_pos += SIMD_BYTES;
offsets_pos += SIMD_BYTES;
}
#endif
while (filt_pos < filt_end)
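The offset arithmetic in the dense branches relies on row i of the array column spanning elements [offsets[i - 1], offsets[i]) of the flat element storage, so a fully passing chunk starts at offsets_pos[-1] (or 0 for the very first row) and ends at offsets_pos[SIMD_BYTES - 1]. A small standalone sketch of just that bound computation (hypothetical names, not the ClickHouse helpers):

#include <cstddef>
#include <cstdint>
#include <vector>

/// When all simd_bytes rows of a chunk pass, the copied element range is simply
/// [chunk_offset, chunk_offset + chunk_size) of the flat element storage.
struct DenseChunk { size_t chunk_offset; size_t chunk_size; };

DenseChunk dense_chunk_bounds(const std::vector<uint64_t> & offsets, size_t first_row, size_t simd_bytes)
{
    /// offsets[i] is the end (exclusive) of row i in the element array;
    /// the start of row i is offsets[i - 1], or 0 for the very first row.
    const size_t chunk_offset = first_row == 0 ? 0 : offsets[first_row - 1];
    const size_t chunk_size = offsets[first_row + simd_bytes - 1] - chunk_offset;
    return {chunk_offset, chunk_size};
}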