Optimize DB::combineFilters with BMI2/SSE intrinsics

DB::combineFilters combines two UInt8 columns (filters) by scanning
the first column for non-zero elements and replacing them, in order,
with consecutive elements of the second column.

As a follow-up to #56079, the function is optimized with BMI2 and
SSE intrinsics so that x86 CPUs without AVX512 support can also
accelerate the column combination via data parallelism.
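
For illustration only (not part of this commit), a minimal scalar sketch
of the semantics, with hypothetical buffer names:

    #include <cstddef>
    #include <cstdint>

    /// Scalar reference: every non-zero byte of first[0..n) is overwritten with
    /// the next byte taken contiguously from second; zero bytes stay untouched.
    /// second must hold exactly one byte per non-zero byte of first.
    void combineFiltersScalar(uint8_t * first, size_t n, const uint8_t * second)
    {
        for (size_t i = 0; i < n; ++i)
            if (first[i])
                first[i] = *second++;
    }

    /// Example: first = {1, 0, 1, 1, 0}, second = {1, 0, 1}
    /// -> first becomes {1, 0, 0, 1, 0}: the result is 1 only where both
    ///    filters have a 1, which matches the contract of combineFilters.
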
Zhiguo Zhou 2024-01-16 15:33:27 +08:00
parent 83f0c3bb99
commit 7450c35189

@@ -1283,6 +1283,81 @@ inline void combineFiltersImpl(UInt8 * first_begin, const UInt8 * first_end, const UInt8 * second_begin)
}
)
/* The BMI2 intrinsic, _pdep_u64 (unsigned __int64 a, unsigned __int64 mask), works
* by copying contiguous low-order bits from the unsigned 64-bit integer a to the
* destination at the bit locations specified by mask. To implement the column
* combination with this intrinsic, 8 contiguous bytes are loaded from second_begin
* as a UInt64 and act as the first operand, while the mask is constructed from
* first_begin so that the bytes to be replaced (non-zero elements) are mapped to
* 0xFF at the corresponding bit locations and all other bytes to 0x00.
*
* The construction of mask employs the SSE intrinsic _mm_cmpeq_epi8(__m128i a,
* __m128i b), which compares packed 8-bit integers from first_begin with packed
* zeros and outputs 0xFF for equality and 0x00 for inequality. Negating the result
* then yields the desired bit mask for _pdep_u64.
*
* The example below visualizes how this optimization applies to the combination of
* two quadwords from first_begin and second_begin.
*
*        Addr: high <------------------------------------------------------ low
*
* first_begin..........................0x00 0x11 0x12 0x00 0x00 0x13 0x14 0x15
*     |  _mm_cmpeq_epi8(src, 0), applied per byte
*     v
* inv_mask.............................0xFF 0x00 0x00 0xFF 0xFF 0x00 0x00 0x00
*     |  negation, applied per byte
*     v
* mask.................................0x00 0xFF 0xFF 0x00 0x00 0xFF 0xFF 0xFF
*     |  _pdep_u64(second_begin, mask)
*     v
* dst..................................0x00 0x05 0x04 0x00 0x00 0x03 0x02 0x01
*
* second_begin.........................0x00 0x00 0x00 0x05 0x04 0x03 0x02 0x01
*
* The five low-order bytes of second_begin (0x01 .. 0x05) are deposited, from the
* lowest address upwards, into the five byte positions where mask is 0xFF; all
* other bytes of dst are zeroed.
*
* References:
* 1. https://www.felixcloutier.com/x86/pdep
* 2. https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd
*/
DECLARE_AVX2_SPECIFIC_CODE(
inline void combineFiltersImpl(UInt8 * first_begin, const UInt8 * first_end, const UInt8 * second_begin)
{
    constexpr size_t XMM_VEC_SIZE_IN_BYTES = 16;
    const __m128i zero16 = _mm_setzero_si128();
    while (first_begin + XMM_VEC_SIZE_IN_BYTES <= first_end)
    {
        __m128i src = _mm_loadu_si128(reinterpret_cast<__m128i *>(first_begin));
        __m128i inv_mask = _mm_cmpeq_epi8(src, zero16);
        UInt64 masks[] = {
            ~static_cast<UInt64>(_mm_extract_epi64(inv_mask, 0)),
            ~static_cast<UInt64>(_mm_extract_epi64(inv_mask, 1)),
        };
        for (const auto & mask : masks)
        {
            UInt64 dst = _pdep_u64(unalignedLoad<UInt64>(second_begin), mask);
            unalignedStore<UInt64>(first_begin, dst);
            first_begin += sizeof(UInt64);
            second_begin += std::popcount(mask) / 8;
        }
    }
    for (/* empty */; first_begin < first_end; ++first_begin)
    {
        if (*first_begin)
        {
            *first_begin = *second_begin++;
        }
    }
}
)
/// Second filter size must be equal to number of 1s in the first filter.
/// The result has size equal to first filter size and contains 1s only where both filters contain 1s.
static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second)
@@ -1330,6 +1405,10 @@ static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second)
    {
        TargetSpecific::AVX512VBMI2::combineFiltersImpl(first_data.begin(), first_data.end(), second_data);
    }
    else if (isArchSupported(TargetArch::AVX2))
    {
        TargetSpecific::AVX2::combineFiltersImpl(first_data.begin(), first_data.end(), second_data);
    }
    else
#endif
    {
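
For reference only (not part of the diff above): a rough, self-contained sketch of
the _mm_cmpeq_epi8 + _pdep_u64 technique on plain buffers. It assumes a C++20
compiler and a build targeting SSE4.1 and BMI2 (e.g. g++ -O2 -std=c++20 -msse4.1
-mbmi2); combineFiltersDemo and the test data are hypothetical, and the oversized
second[] provides the slack needed because each step loads a full 8 source bytes
(ClickHouse's padded column buffers serve the same purpose).

    #include <bit>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <immintrin.h>

    // Combine 16 filter bytes per iteration with the cmpeq + pdep trick, then
    // fall back to a scalar loop for the remainder.
    static void combineFiltersDemo(uint8_t * first, size_t n, const uint8_t * second)
    {
        const __m128i zero16 = _mm_setzero_si128();
        size_t i = 0;
        for (; i + 16 <= n; i += 16)
        {
            __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(first + i));
            __m128i inv_mask = _mm_cmpeq_epi8(src, zero16);             // 0xFF where first's byte is zero
            uint64_t masks[] = {
                ~static_cast<uint64_t>(_mm_extract_epi64(inv_mask, 0)), // 0xFF where first's byte is non-zero
                ~static_cast<uint64_t>(_mm_extract_epi64(inv_mask, 1)),
            };
            uint8_t * out = first + i;
            for (uint64_t mask : masks)
            {
                uint64_t src64;
                std::memcpy(&src64, second, sizeof(src64));   // unaligned 8-byte load of source values
                uint64_t dst = _pdep_u64(src64, mask);        // deposit them at the non-zero positions
                std::memcpy(out, &dst, sizeof(dst));
                out += sizeof(uint64_t);
                second += std::popcount(mask) / 8;            // one source byte consumed per 0xFF in mask
            }
        }
        for (; i < n; ++i)                                    // scalar tail, as in the diff above
            if (first[i])
                first[i] = *second++;
    }

    int main()
    {
        uint8_t first[16] = {1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0};
        uint8_t second[16] = {9, 8, 7, 6, 5, 4, 3, 2, 1};     // 9 values + zero padding for the 8-byte loads
        combineFiltersDemo(first, 16, second);
        for (uint8_t b : first)
            std::printf("%d ", static_cast<int>(b));          // prints: 9 0 8 7 0 0 6 5 0 4 0 0 3 2 1 0
        std::printf("\n");
        return 0;
    }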