mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
add x86 feature avx2/avx512 support for filter implementation
This commit is contained in:
parent
0adbcd58c9
commit
a3d629a5b5
@ -18,6 +18,8 @@ option (ENABLE_PCLMULQDQ "Use pclmulqdq instructions on x86_64" 1)
|
||||
option (ENABLE_POPCNT "Use popcnt instructions on x86_64" 1)
|
||||
option (ENABLE_AVX "Use AVX instructions on x86_64" 0)
|
||||
option (ENABLE_AVX2 "Use AVX2 instructions on x86_64" 0)
|
||||
option (ENABLE_AVX512 "Use AVX512 instructions on x86_64" 1)
|
||||
option (ENABLE_BMI "Use BMI instructions on x86_64" 1)
|
||||
|
||||
option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)
|
||||
|
||||
@ -127,6 +129,36 @@ else ()
|
||||
if (HAVE_AVX2 AND ENABLE_AVX2)
|
||||
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
|
||||
endif ()
|
||||
|
||||
set (TEST_FLAG "-mavx512f -mavx512bw")
|
||||
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
|
||||
check_cxx_source_compiles("
|
||||
#include <immintrin.h>
|
||||
int main() {
|
||||
auto a = _mm512_setzero_epi32();
|
||||
(void)a;
|
||||
auto b = _mm512_add_epi16(__m512i(), __m512i());
|
||||
(void)b;
|
||||
return 0;
|
||||
}
|
||||
" HAVE_AVX512)
|
||||
if (HAVE_AVX512 AND ENABLE_AVX512)
|
||||
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
|
||||
endif ()
|
||||
|
||||
set (TEST_FLAG "-mbmi")
|
||||
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
|
||||
check_cxx_source_compiles("
|
||||
#include <immintrin.h>
|
||||
int main() {
|
||||
auto a = _blsr_u32(0);
|
||||
(void)a;
|
||||
return 0;
|
||||
}
|
||||
" HAVE_BMI)
|
||||
if (HAVE_BMI AND ENABLE_BMI)
|
||||
set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
cmake_pop_check_state ()
|
||||
|
@ -230,8 +230,74 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
|
||||
const UInt8 * filt_pos = filt.data();
|
||||
const UInt8 * filt_end = filt_pos + col_size;
|
||||
const UInt8 * data_pos = chars.data();
|
||||
#if defined(__AVX512F__) && defined(__AVX512BW__)
|
||||
static constexpr size_t SIMD_BYTES = 64;
|
||||
const __m512i zero64 = _mm512_setzero_epi32();
|
||||
const UInt8 * filt_end_avx512 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
|
||||
const size_t chars_per_simd_elements = SIMD_BYTES * n;
|
||||
|
||||
#ifdef __SSE2__
|
||||
while (filt_pos < filt_end_avx512)
|
||||
{
|
||||
uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
|
||||
|
||||
if (0xFFFFFFFFFFFFFFFF == mask)
|
||||
{
|
||||
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t res_chars_size = res->chars.size();
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctzll(mask);
|
||||
res->chars.resize(res_chars_size + n);
|
||||
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
|
||||
res_chars_size += n;
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u64(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
data_pos += chars_per_simd_elements;
|
||||
filt_pos += SIMD_BYTES;
|
||||
}
|
||||
#elif defined(__AVX2__)
|
||||
static constexpr size_t SIMD_BYTES = 32;
|
||||
const __m256i zero32 = _mm256_setzero_si256();
|
||||
const UInt8 * filt_end_avx2 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
|
||||
const size_t chars_per_simd_elements = SIMD_BYTES * n;
|
||||
|
||||
while (filt_pos < filt_end_avx2)
|
||||
{
|
||||
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
|
||||
|
||||
if (0xFFFFFFFF == mask)
|
||||
{
|
||||
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t res_chars_size = res->chars.size();
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctz(mask);
|
||||
res->chars.resize(res_chars_size + n);
|
||||
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
|
||||
res_chars_size += n;
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u32(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
data_pos += chars_per_simd_elements;
|
||||
filt_pos += SIMD_BYTES;
|
||||
}
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
/** A slightly more optimized version.
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
@ -267,6 +333,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
|
||||
data_pos += chars_per_simd_elements;
|
||||
filt_pos += SIMD_BYTES;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
size_t res_chars_size = res->chars.size();
|
||||
|
@ -311,7 +311,67 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
const UInt8 * filt_end = filt_pos + size;
|
||||
const T * data_pos = data.data();
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(__AVX512F__) && defined(__AVX512BW__)
|
||||
static constexpr size_t SIMD_BYTES = 64;
|
||||
const __m512i zero64 = _mm512_setzero_epi32();
|
||||
const UInt8 * filt_end_avx512 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_avx512)
|
||||
{
|
||||
UInt64 mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
|
||||
|
||||
if (0xFFFFFFFFFFFFFFFF == mask)
|
||||
{
|
||||
res_data.insert(data_pos, data_pos + SIMD_BYTES);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctzll(mask);
|
||||
res_data.push_back(data_pos[index]);
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u64(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
#elif defined(__AVX2__)
|
||||
static constexpr size_t SIMD_BYTES = 32;
|
||||
const __m256i zero32 = _mm256_setzero_si256();
|
||||
const UInt8 * filt_end_avx2 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_avx2)
|
||||
{
|
||||
UInt32 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
|
||||
|
||||
if (0xFFFFFFFF == mask)
|
||||
{
|
||||
res_data.insert(data_pos, data_pos + SIMD_BYTES);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctz(mask);
|
||||
res_data.push_back(data_pos[index]);
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u32(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
/** A slightly more optimized version.
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
@ -344,6 +404,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
while (filt_pos < filt_end)
|
||||
|
@ -229,7 +229,89 @@ namespace
|
||||
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
|
||||
};
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(__AVX512F__) && defined(__AVX512BW__)
|
||||
const __m512i zero_vec = _mm512_setzero_epi32();
|
||||
static constexpr size_t SIMD_BYTES = 64;
|
||||
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero_vec, _MM_CMPINT_GT);
|
||||
|
||||
if (mask == 0xffffffffffffffff)
|
||||
{
|
||||
/// SIMD_BYTES consecutive rows pass the filter
|
||||
const auto first = offsets_pos == offsets_begin;
|
||||
|
||||
const auto chunk_offset = first ? 0 : offsets_pos[-1];
|
||||
const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
|
||||
|
||||
result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
|
||||
|
||||
/// copy elements for SIMD_BYTES arrays at once
|
||||
const auto elems_size_old = res_elems.size();
|
||||
res_elems.resize(elems_size_old + chunk_size);
|
||||
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctzll(mask);
|
||||
copy_array(offsets_pos + index);
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u64(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
offsets_pos += SIMD_BYTES;
|
||||
}
|
||||
#elif defined(__AVX2__)
|
||||
const __m256i zero_vec = _mm256_setzero_si256();
|
||||
static constexpr size_t SIMD_BYTES = 32;
|
||||
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero_vec));
|
||||
|
||||
if (mask == 0xffffffff)
|
||||
{
|
||||
/// SIMD_BYTES consecutive rows pass the filter
|
||||
const auto first = offsets_pos == offsets_begin;
|
||||
|
||||
const auto chunk_offset = first ? 0 : offsets_pos[-1];
|
||||
const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
|
||||
|
||||
result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
|
||||
|
||||
/// copy elements for SIMD_BYTES arrays at once
|
||||
const auto elems_size_old = res_elems.size();
|
||||
res_elems.resize(elems_size_old + chunk_size);
|
||||
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
|
||||
}
|
||||
else
|
||||
{
|
||||
while (mask)
|
||||
{
|
||||
size_t index = __builtin_ctz(mask);
|
||||
copy_array(offsets_pos + index);
|
||||
#ifdef __BMI__
|
||||
mask = _blsr_u32(mask);
|
||||
#else
|
||||
mask = mask & (mask-1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
offsets_pos += SIMD_BYTES;
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
const __m128i zero_vec = _mm_setzero_si128();
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
@ -268,7 +350,7 @@ namespace
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
offsets_pos += SIMD_BYTES;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
while (filt_pos < filt_end)
|
||||
|
Loading…
Reference in New Issue
Block a user