Merge pull request #38565 from ClickHouse/merging-column-vector-replicate-optimize

Merging #37235
This commit is contained in:
Maksim Kita 2022-06-30 18:29:51 +02:00 committed by GitHub
commit 3a1d8f4518
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -577,6 +577,115 @@ ColumnPtr ColumnVector<T>::index(const IColumn & indexes, size_t limit) const
return selectIndexImpl(*this, indexes, limit);
}
#ifdef __SSE2__
namespace
{
/** Optimization for ColumnVector replicate using SIMD instructions.
* For such optimization it is important that data is right padded with 15 bytes.
*
* Replicate span size is offsets[i] - offsets[i - 1].
*
* Split spans into 3 categories.
* 1. Span with 0 size. Continue iteration.
*
* 2. Span with 1 size. Update pointer from which data must be copied into result.
* Then if we see span with size 1 or greater than 1 copy data directly into result data and reset pointer.
* Example:
* Data: 1 2 3 4
* Offsets: 1 2 3 4
* Result data: 1 2 3 4
*
* 3. Span with size greater than 1. Save single data element into register and copy it into result data.
* Example:
* Data: 1 2 3 4
* Offsets: 4 4 4 4
* Result data: 1 1 1 1
*
* Additional handling for tail is needed if pointer from which data must be copied from span with size 1 is not null.
*/
template<typename IntType>
requires (std::is_same_v<IntType, Int32> || std::is_same_v<IntType, UInt32>)
void replicateSSE42Int32(const IntType * __restrict data, IntType * __restrict result_data, const IColumn::Offsets & offsets)
{
const IntType * data_copy_begin_ptr = nullptr;
size_t offsets_size = offsets.size();
for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index)
{
size_t span = offsets[offset_index] - offsets[offset_index - 1];
if (span == 1)
{
if (!data_copy_begin_ptr)
data_copy_begin_ptr = data + offset_index;
continue;
}
/// Copy data
if (data_copy_begin_ptr)
{
size_t copy_size = (data + offset_index) - data_copy_begin_ptr;
bool remainder = copy_size % 4;
size_t sse_copy_counter = (copy_size / 4) + remainder;
auto * result_data_copy = result_data;
while (sse_copy_counter)
{
__m128i copy_batch = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_copy_begin_ptr));
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_copy), copy_batch);
result_data_copy += 4;
data_copy_begin_ptr += 4;
--sse_copy_counter;
}
result_data += copy_size;
data_copy_begin_ptr = nullptr;
}
if (span == 0)
continue;
/// Copy single data element into result data
bool span_remainder = span % 4;
size_t copy_counter = (span / 4) + span_remainder;
auto * result_data_tmp = result_data;
__m128i copy_element_data = _mm_set1_epi32(data[offset_index]);
while (copy_counter)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_tmp), copy_element_data);
result_data_tmp += 4;
--copy_counter;
}
result_data += span;
}
/// Copy tail if needed
if (data_copy_begin_ptr)
{
size_t copy_size = (data + offsets_size) - data_copy_begin_ptr;
bool remainder = copy_size % 4;
size_t sse_copy_counter = (copy_size / 4) + remainder;
while (sse_copy_counter)
{
__m128i copy_batch = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_copy_begin_ptr));
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data), copy_batch);
result_data += 4;
data_copy_begin_ptr += 4;
--sse_copy_counter;
}
}
}
}
#endif
template <typename T>
ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
{
@ -589,6 +698,14 @@ ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
auto res = this->create(offsets.back());
#ifdef __SSE2__
if constexpr (std::is_same_v<T, UInt32>)
{
replicateSSE42Int32(getData().data(), res->getData().data(), offsets);
return res;
}
#endif
auto it = res->getData().begin(); // NOLINT
for (size_t i = 0; i < size; ++i)
{