From a3d629a5b541ef2d0489b9b7e7e710ed3c7a0410 Mon Sep 17 00:00:00 2001
From: jasperzhu
Date: Mon, 11 Oct 2021 22:51:13 +0530
Subject: [PATCH] add x86 feature avx2/avx512 support for filter implementation

---
 cmake/cpu_features.cmake          | 32 ++++++
 src/Columns/ColumnFixedString.cpp | 69 ++++++++++++++++++++++++-
 src/Columns/ColumnVector.cpp      | 63 +++++++++++++++++++++-
 src/Columns/ColumnsCommon.cpp     | 86 ++++++++++++++++++++++++++++++-
 4 files changed, 246 insertions(+), 4 deletions(-)

diff --git a/cmake/cpu_features.cmake b/cmake/cpu_features.cmake
index 46e42329958..d77ca0b32e3 100644
--- a/cmake/cpu_features.cmake
+++ b/cmake/cpu_features.cmake
@@ -18,6 +18,8 @@ option (ENABLE_PCLMULQDQ "Use pclmulqdq instructions on x86_64" 1)
 option (ENABLE_POPCNT "Use popcnt instructions on x86_64" 1)
 option (ENABLE_AVX "Use AVX instructions on x86_64" 0)
 option (ENABLE_AVX2 "Use AVX2 instructions on x86_64" 0)
+option (ENABLE_AVX512 "Use AVX512 instructions on x86_64" 1)
+option (ENABLE_BMI "Use BMI instructions on x86_64" 1)
 
 option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)
 
@@ -127,6 +129,36 @@ else ()
     if (HAVE_AVX2 AND ENABLE_AVX2)
         set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
     endif ()
+
+    set (TEST_FLAG "-mavx512f -mavx512bw")
+    set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() {
+            auto a = _mm512_setzero_epi32();
+            (void)a;
+            auto b = _mm512_add_epi16(__m512i(), __m512i());
+            (void)b;
+            return 0;
+        }
+    " HAVE_AVX512)
+    if (HAVE_AVX512 AND ENABLE_AVX512)
+        set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
+    endif ()
+
+    set (TEST_FLAG "-mbmi")
+    set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG} -O0")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() {
+            auto a = _blsr_u32(0);
+            (void)a;
+            return 0;
+        }
+    " HAVE_BMI)
+    if (HAVE_BMI AND ENABLE_BMI)
+        set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
+    endif ()
 endif ()
 
 cmake_pop_check_state ()
diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp
index 94127fa8eb3..9daec1c1c64 100644
--- a/src/Columns/ColumnFixedString.cpp
+++ b/src/Columns/ColumnFixedString.cpp
@@ -230,8 +230,74 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
     const UInt8 * filt_pos = filt.data();
     const UInt8 * filt_end = filt_pos + col_size;
     const UInt8 * data_pos = chars.data();
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    static constexpr size_t SIMD_BYTES = 64;
+    const __m512i zero64 = _mm512_setzero_epi32();
+    const UInt8 * filt_end_avx512 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
+    const size_t chars_per_simd_elements = SIMD_BYTES * n;
 
-#ifdef __SSE2__
+    while (filt_pos < filt_end_avx512)
+    {
+        uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
+
+        if (0xFFFFFFFFFFFFFFFF == mask)
+        {
+            res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
+        }
+        else
+        {
+            size_t res_chars_size = res->chars.size();
+            while (mask)
+            {
+                size_t index = __builtin_ctzll(mask);
+                res->chars.resize(res_chars_size + n);
+                memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
+                res_chars_size += n;
+                #ifdef __BMI__
+                mask = _blsr_u64(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+        data_pos += chars_per_simd_elements;
+        filt_pos += SIMD_BYTES;
+    }
+#elif defined(__AVX2__)
+    static constexpr size_t SIMD_BYTES = 32;
+    const __m256i zero32 = _mm256_setzero_si256();
+    const UInt8 * filt_end_avx2 = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES;
+    const size_t chars_per_simd_elements = SIMD_BYTES * n;
+
+    while (filt_pos < filt_end_avx2)
+    {
+        uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
+
+        if (0xFFFFFFFF == mask)
+        {
+            res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
+        }
+        else
+        {
+            size_t res_chars_size = res->chars.size();
+            while (mask)
+            {
+                size_t index = __builtin_ctz(mask);
+                res->chars.resize(res_chars_size + n);
+                memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos + index * n, n);
+                res_chars_size += n;
+                #ifdef __BMI__
+                mask = _blsr_u32(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+        data_pos += chars_per_simd_elements;
+        filt_pos += SIMD_BYTES;
+    }
+
+#elif defined(__SSE2__)
     /** A slightly more optimized version.
     * Based on the assumption that often pieces of consecutive values
     * completely pass or do not pass the filter.
@@ -267,6 +333,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
         data_pos += chars_per_simd_elements;
         filt_pos += SIMD_BYTES;
     }
+
 #endif
 
     size_t res_chars_size = res->chars.size();
diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp
index 3ee692a3ff4..000a7198446 100644
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@@ -311,7 +311,67 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
     const UInt8 * filt_end = filt_pos + size;
     const T * data_pos = data.data();
 
-#ifdef __SSE2__
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+    static constexpr size_t SIMD_BYTES = 64;
+    const __m512i zero64 = _mm512_setzero_epi32();
+    const UInt8 * filt_end_avx512 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
+
+    while (filt_pos < filt_end_avx512)
+    {
+        UInt64 mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero64, _MM_CMPINT_GT);
+
+        if (0xFFFFFFFFFFFFFFFF == mask)
+        {
+            res_data.insert(data_pos, data_pos + SIMD_BYTES);
+        }
+        else
+        {
+            while (mask)
+            {
+                size_t index = __builtin_ctzll(mask);
+                res_data.push_back(data_pos[index]);
+                #ifdef __BMI__
+                mask = _blsr_u64(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+
+        filt_pos += SIMD_BYTES;
+        data_pos += SIMD_BYTES;
+    }
+#elif defined(__AVX2__)
+    static constexpr size_t SIMD_BYTES = 32;
+    const __m256i zero32 = _mm256_setzero_si256();
+    const UInt8 * filt_end_avx2 = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
+
+    while (filt_pos < filt_end_avx2)
+    {
+        UInt32 mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero32));
+
+        if (0xFFFFFFFF == mask)
+        {
+            res_data.insert(data_pos, data_pos + SIMD_BYTES);
+        }
+        else
+        {
+            while (mask)
+            {
+                size_t index = __builtin_ctz(mask);
+                res_data.push_back(data_pos[index]);
+                #ifdef __BMI__
+                mask = _blsr_u32(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+
+        filt_pos += SIMD_BYTES;
+        data_pos += SIMD_BYTES;
+    }
+#elif defined(__SSE2__)
     /** A slightly more optimized version.
     * Based on the assumption that often pieces of consecutive values
     * completely pass or do not pass the filter.
@@ -344,6 +404,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
         filt_pos += SIMD_BYTES;
         data_pos += SIMD_BYTES;
     }
+
 #endif
 
     while (filt_pos < filt_end)
diff --git a/src/Columns/ColumnsCommon.cpp b/src/Columns/ColumnsCommon.cpp
index a4d7de34382..36c292b4196 100644
--- a/src/Columns/ColumnsCommon.cpp
+++ b/src/Columns/ColumnsCommon.cpp
@@ -229,7 +229,89 @@ namespace
         memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
     };
 
-    #ifdef __SSE2__
+    #if defined(__AVX512F__) && defined(__AVX512BW__)
+    const __m512i zero_vec = _mm512_setzero_epi32();
+    static constexpr size_t SIMD_BYTES = 64;
+    const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
+
+    while (filt_pos < filt_end_aligned)
+    {
+        uint64_t mask = _mm512_cmp_epi8_mask(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(filt_pos)), zero_vec, _MM_CMPINT_GT);
+
+        if (mask == 0xffffffffffffffff)
+        {
+            /// SIMD_BYTES consecutive rows pass the filter
+            const auto first = offsets_pos == offsets_begin;
+
+            const auto chunk_offset = first ? 0 : offsets_pos[-1];
+            const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
+
+            result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
+
+            /// copy elements for SIMD_BYTES arrays at once
+            const auto elems_size_old = res_elems.size();
+            res_elems.resize(elems_size_old + chunk_size);
+            memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
+        }
+        else
+        {
+            while (mask)
+            {
+                size_t index = __builtin_ctzll(mask);
+                copy_array(offsets_pos + index);
+                #ifdef __BMI__
+                mask = _blsr_u64(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+
+        filt_pos += SIMD_BYTES;
+        offsets_pos += SIMD_BYTES;
+    }
+    #elif defined(__AVX2__)
+    const __m256i zero_vec = _mm256_setzero_si256();
+    static constexpr size_t SIMD_BYTES = 32;
+    const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
+
+    while (filt_pos < filt_end_aligned)
+    {
+        uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(filt_pos)), zero_vec));
+
+        if (mask == 0xffffffff)
+        {
+            /// SIMD_BYTES consecutive rows pass the filter
+            const auto first = offsets_pos == offsets_begin;
+
+            const auto chunk_offset = first ? 0 : offsets_pos[-1];
+            const auto chunk_size = offsets_pos[SIMD_BYTES - 1] - chunk_offset;
+
+            result_offsets_builder.template insertChunk<SIMD_BYTES>(offsets_pos, first, chunk_offset, chunk_size);
+
+            /// copy elements for SIMD_BYTES arrays at once
+            const auto elems_size_old = res_elems.size();
+            res_elems.resize(elems_size_old + chunk_size);
+            memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
+        }
+        else
+        {
+            while (mask)
+            {
+                size_t index = __builtin_ctz(mask);
+                copy_array(offsets_pos + index);
+                #ifdef __BMI__
+                mask = _blsr_u32(mask);
+                #else
+                mask = mask & (mask - 1);
+                #endif
+            }
+        }
+
+        filt_pos += SIMD_BYTES;
+        offsets_pos += SIMD_BYTES;
+    }
+    #elif defined(__SSE2__)
     const __m128i zero_vec = _mm_setzero_si128();
     static constexpr size_t SIMD_BYTES = 16;
     const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
@@ -268,7 +350,7 @@ namespace
 
         filt_pos += SIMD_BYTES;
         offsets_pos += SIMD_BYTES;
-    }
+    }
 #endif
 
     while (filt_pos < filt_end)
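
Note on the hot loop shared by all three filter implementations above: one SIMD compare turns 64 (or 32) filter bytes into a single bitmask, an all-ones mask takes the bulk-copy fast path, and any other mask is walked bit by bit with count-trailing-zeros plus clear-lowest-set-bit. The identity `mask & (mask - 1)` clears the lowest set bit, which is exactly what BMI1's _blsr_u64 computes in one instruction, hence the #ifdef __BMI__ fallback pairs. The standalone sketch below is not part of the patch; for_each_set_bit and the sample data are illustrative only. It isolates that scalar remainder loop, so it compiles with GCC or Clang on any architecture without the intrinsics.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    /// Hypothetical helper mirroring the patch's "else" branch: visit the
    /// index of every set bit in `mask`, lowest bit first.
    template <typename F>
    void for_each_set_bit(uint64_t mask, F && f)
    {
        while (mask)
        {
            size_t index = __builtin_ctzll(mask); /// index of lowest set bit
            f(index);
            mask = mask & (mask - 1); /// clear lowest set bit (= _blsr_u64(mask))
        }
    }

    int main()
    {
        /// Pretend rows 0, 3 and 5 of a chunk passed the filter; the SIMD
        /// compare in the patch would have produced this mask.
        uint64_t mask = 0b101001;

        std::vector<int> data = {10, 20, 30, 40, 50, 60};
        std::vector<int> filtered;

        for_each_set_bit(mask, [&](size_t index) { filtered.push_back(data[index]); });

        for (int value : filtered)
            std::printf("%d\n", value); /// prints 10, 40, 60
        return 0;
    }

The sparse-mask loop costs one branch per surviving row, which is why the patch keeps the separate fast path for the common case where a whole chunk passes the filter.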