mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-24 10:40:49 +00:00
Replace branchy code in the filter operation with a popcnt/ctz-based implementation that has better performance
Co-authored-by: Zhu Jasper <jasper.zhu@intel.com>
This commit is contained in:
parent
2683da5621
commit
383f3a3a20
@ -231,7 +231,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
|
||||
const UInt8 * filt_end = filt_pos + col_size;
|
||||
const UInt8 * data_pos = chars.data();
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(__SSE2__) && defined(__POPCNT__)
|
||||
/** A slightly more optimized version.
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
@ -251,28 +251,24 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
|
||||
if (0 == mask)
|
||||
{
|
||||
/// Nothing is inserted.
|
||||
data_pos += chars_per_simd_elements;
|
||||
}
|
||||
else if (0xFFFF == mask)
|
||||
{
|
||||
res->chars.insert(data_pos, data_pos + chars_per_simd_elements);
|
||||
data_pos += chars_per_simd_elements;
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t res_chars_size = res->chars.size();
|
||||
for (size_t i = 0; i < SIMD_BYTES; ++i)
|
||||
{
|
||||
if (filt_pos[i])
|
||||
{
|
||||
res->chars.resize(res_chars_size + n);
|
||||
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n);
|
||||
res_chars_size += n;
|
||||
}
|
||||
data_pos += n;
|
||||
size_t pcnt = __builtin_popcount(mask);
|
||||
for(size_t j = 0; j < pcnt; j++) {
|
||||
size_t index = __builtin_ctz(mask);
|
||||
res->chars.resize(res_chars_size + n);
|
||||
memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos+index*n, n);
|
||||
res_chars_size += n;
|
||||
mask = mask & (mask-1);
|
||||
}
|
||||
}
|
||||
|
||||
data_pos += chars_per_simd_elements;
|
||||
filt_pos += SIMD_BYTES;
|
||||
}
|
||||
#endif
|
||||
|
@ -311,7 +311,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
const UInt8 * filt_end = filt_pos + size;
|
||||
const T * data_pos = data.data();
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(__SSE2__) && defined(__POPCNT__)
|
||||
/** A slightly more optimized version.
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
@ -337,9 +337,12 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < SIMD_BYTES; ++i)
|
||||
if (filt_pos[i])
|
||||
res_data.push_back(data_pos[i]);
|
||||
size_t pcnt = __builtin_popcount(mask);
|
||||
for(size_t j = 0; j < pcnt; j++) {
|
||||
size_t index = __builtin_ctz(mask);
|
||||
res_data.push_back(data_pos[index]);
|
||||
mask = mask & (mask-1);
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
|
@ -229,7 +229,7 @@ namespace
|
||||
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
|
||||
};
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(__SSE2__) && defined(__POPCNT__)
|
||||
const __m128i zero_vec = _mm_setzero_si128();
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
const auto * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
@ -262,9 +262,12 @@ namespace
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < SIMD_BYTES; ++i)
|
||||
if (filt_pos[i])
|
||||
copy_array(offsets_pos + i);
|
||||
size_t pcnt = __builtin_popcount(mask);
|
||||
for(size_t j = 0; j < pcnt; j++) {
|
||||
size_t index = __builtin_ctz(mask);
|
||||
copy_array(offsets_pos + index);
|
||||
mask = mask & (mask-1);
|
||||
}
|
||||
}
|
||||
|
||||
filt_pos += SIMD_BYTES;
|
||||
|
Loading…
Reference in New Issue
Block a user