From 383f3a3a20ff7268b4c2fad14d0e20f266682b77 Mon Sep 17 00:00:00 2001 From: vesslanjin Date: Fri, 8 Oct 2021 09:19:58 -0400 Subject: [PATCH] =?UTF-8?q?Remove=C2=A0branchy=C2=A0code=C2=A0in=C2=A0filt?= =?UTF-8?q?er=C2=A0operation=C2=A0with=C2=A0a=C2=A0better=C2=A0implementat?= =?UTF-8?q?ion=C2=A0with=C2=A0popcnt/ctz=C2=A0which=C2=A0have=C2=A0better?= =?UTF-8?q?=C2=A0performance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Zhu Jasper --- src/Columns/ColumnFixedString.cpp | 22 +++++++++------------- src/Columns/ColumnVector.cpp | 11 +++++++---- src/Columns/ColumnsCommon.cpp | 11 +++++++---- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index ce39ab0994c..4bfc6513263 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -231,7 +231,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result const UInt8 * filt_end = filt_pos + col_size; const UInt8 * data_pos = chars.data(); -#ifdef __SSE2__ +#if defined(__SSE2__) && defined(__POPCNT__) /** A slightly more optimized version. * Based on the assumption that often pieces of consecutive values * completely pass or do not pass the filter. @@ -251,28 +251,24 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result if (0 == mask) { /// Nothing is inserted. - data_pos += chars_per_simd_elements; } else if (0xFFFF == mask) { res->chars.insert(data_pos, data_pos + chars_per_simd_elements); - data_pos += chars_per_simd_elements; } else { size_t res_chars_size = res->chars.size(); - for (size_t i = 0; i < SIMD_BYTES; ++i) - { - if (filt_pos[i]) - { - res->chars.resize(res_chars_size + n); - memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); - res_chars_size += n; - } - data_pos += n; + size_t pcnt = __builtin_popcount(mask); + for(size_t j = 0; j < pcnt; j++) { + size_t index = __builtin_ctz(mask); + res->chars.resize(res_chars_size + n); + memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos+index*n, n); + res_chars_size += n; + mask = mask & (mask-1); } } - + data_pos += chars_per_simd_elements; filt_pos += SIMD_BYTES; } #endif diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 7f3cdaeec7f..ff84204a7cb 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -311,7 +311,7 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter & filt, ssize_t result_s const UInt8 * filt_end = filt_pos + size; const T * data_pos = data.data(); -#ifdef __SSE2__ +#if defined(__SSE2__) && defined(__POPCNT__) /** A slightly more optimized version. * Based on the assumption that often pieces of consecutive values * completely pass or do not pass the filter. @@ -337,9 +337,12 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter & filt, ssize_t result_s } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) - res_data.push_back(data_pos[i]); + size_t pcnt = __builtin_popcount(mask); + for(size_t j = 0; j < pcnt; j++) { + size_t index = __builtin_ctz(mask); + res_data.push_back(data_pos[index]); + mask = mask & (mask-1); + } } filt_pos += SIMD_BYTES; diff --git a/src/Columns/ColumnsCommon.cpp b/src/Columns/ColumnsCommon.cpp index 41933ed08ed..5c0214054b2 100644 --- a/src/Columns/ColumnsCommon.cpp +++ b/src/Columns/ColumnsCommon.cpp @@ -229,7 +229,7 @@ namespace memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T)); }; - #ifdef __SSE2__ + #if defined(__SSE2__) && defined(__POPCNT__) const __m128i zero_vec = _mm_setzero_si128(); static constexpr size_t SIMD_BYTES = 16; const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; @@ -262,9 +262,12 @@ namespace } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) - copy_array(offsets_pos + i); + size_t pcnt = __builtin_popcount(mask); + for(size_t j = 0; j < pcnt; j++) { + size_t index = __builtin_ctz(mask); + copy_array(offsets_pos + index); + mask = mask & (mask-1); + } } filt_pos += SIMD_BYTES;