Replace the branchy code in the filter operation with a popcnt/ctz-based implementation, which performs better

Co-authored-by: Zhu Jasper <jasper.zhu@intel.com>
This commit is contained in:
vesslanjin 2021-10-08 09:19:58 -04:00
parent 2683da5621
commit 383f3a3a20
3 changed files with 23 additions and 21 deletions

View File

@ -231,7 +231,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
const UInt8 * filt_end = filt_pos + col_size;
const UInt8 * data_pos = chars.data();
#ifdef __SSE2__
#if defined(__SSE2__) && defined(__POPCNT__)
/** A slightly more optimized version.
* Based on the assumption that often pieces of consecutive values
* completely pass or do not pass the filter.
@ -251,28 +251,24 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
if (0 == mask)
{
/// Nothing is inserted.
data_pos += chars_per_simd_elements;
}
else if (0xFFFF == mask)
{
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
data_pos += chars_per_simd_elements;
}
else
{
size_t res_chars_size = res->chars.size();
for (size_t i = 0; i < SIMD_BYTES; ++i)
{
if (filt_pos[i])
{
res->chars.resize(res_chars_size + n);
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n);
res_chars_size += n;
}
data_pos += n;
size_t pcnt = __builtin_popcount(mask);
for(size_t j = 0; j < pcnt; j++) {
size_t index = __builtin_ctz(mask);
res->chars.resize(res_chars_size + n);
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos+index*n, n);
res_chars_size += n;
mask = mask & (mask-1);
}
}
data_pos += chars_per_simd_elements;
filt_pos += SIMD_BYTES;
}
#endif

View File

@ -311,7 +311,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
const UInt8 * filt_end = filt_pos + size;
const T * data_pos = data.data();
#ifdef __SSE2__
#if defined(__SSE2__) && defined(__POPCNT__)
/** A slightly more optimized version.
* Based on the assumption that often pieces of consecutive values
* completely pass or do not pass the filter.
@ -337,9 +337,12 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
}
else
{
for (size_t i = 0; i < SIMD_BYTES; ++i)
if (filt_pos[i])
res_data.push_back(data_pos[i]);
size_t pcnt = __builtin_popcount(mask);
for(size_t j = 0; j < pcnt; j++) {
size_t index = __builtin_ctz(mask);
res_data.push_back(data_pos[index]);
mask = mask & (mask-1);
}
}
filt_pos += SIMD_BYTES;

View File

@ -229,7 +229,7 @@ namespace
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
};
#ifdef __SSE2__
#if defined(__SSE2__) && defined(__POPCNT__)
const __m128i zero_vec = _mm_setzero_si128();
static constexpr size_t SIMD_BYTES = 16;
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
@ -262,9 +262,12 @@ namespace
}
else
{
for (size_t i = 0; i < SIMD_BYTES; ++i)
if (filt_pos[i])
copy_array(offsets_pos + i);
size_t pcnt = __builtin_popcount(mask);
for(size_t j = 0; j < pcnt; j++) {
size_t index = __builtin_ctz(mask);
copy_array(offsets_pos + index);
mask = mask & (mask-1);
}
}
filt_pos += SIMD_BYTES;