mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 17:12:03 +00:00
Merge pull request #38565 from ClickHouse/merging-column-vector-replicate-optimize
Merging #37235
This commit is contained in:
commit
3a1d8f4518
@ -489,10 +489,10 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
const T * data_pos = data.data();
|
||||
|
||||
/** A slightly more optimized version.
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
|
||||
*/
|
||||
* Based on the assumption that often pieces of consecutive values
|
||||
* completely pass or do not pass the filter.
|
||||
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
|
||||
*/
|
||||
static constexpr size_t SIMD_BYTES = 64;
|
||||
const UInt8 * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
@ -577,6 +577,115 @@ ColumnPtr ColumnVector<T>::index(const IColumn & indexes, size_t limit) const
|
||||
return selectIndexImpl(*this, indexes, limit);
|
||||
}
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
namespace
|
||||
{
|
||||
/** Optimization for ColumnVector replicate using SIMD instructions.
|
||||
* For such optimization it is important that data is right padded with 15 bytes.
|
||||
*
|
||||
* Replicate span size is offsets[i] - offsets[i - 1].
|
||||
*
|
||||
* Split spans into 3 categories.
|
||||
* 1. Span with 0 size. Continue iteration.
|
||||
*
|
||||
* 2. Span with 1 size. Update pointer from which data must be copied into result.
|
||||
* Then if we see span with size 1 or greater than 1 copy data directly into result data and reset pointer.
|
||||
* Example:
|
||||
* Data: 1 2 3 4
|
||||
* Offsets: 1 2 3 4
|
||||
* Result data: 1 2 3 4
|
||||
*
|
||||
* 3. Span with size greater than 1. Save single data element into register and copy it into result data.
|
||||
* Example:
|
||||
* Data: 1 2 3 4
|
||||
* Offsets: 4 4 4 4
|
||||
* Result data: 1 1 1 1
|
||||
*
|
||||
* Additional handling for tail is needed if pointer from which data must be copied from span with size 1 is not null.
|
||||
*/
|
||||
template<typename IntType>
|
||||
requires (std::is_same_v<IntType, Int32> || std::is_same_v<IntType, UInt32>)
|
||||
void replicateSSE42Int32(const IntType * __restrict data, IntType * __restrict result_data, const IColumn::Offsets & offsets)
|
||||
{
|
||||
const IntType * data_copy_begin_ptr = nullptr;
|
||||
size_t offsets_size = offsets.size();
|
||||
|
||||
for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index)
|
||||
{
|
||||
size_t span = offsets[offset_index] - offsets[offset_index - 1];
|
||||
if (span == 1)
|
||||
{
|
||||
if (!data_copy_begin_ptr)
|
||||
data_copy_begin_ptr = data + offset_index;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/// Copy data
|
||||
|
||||
if (data_copy_begin_ptr)
|
||||
{
|
||||
size_t copy_size = (data + offset_index) - data_copy_begin_ptr;
|
||||
bool remainder = copy_size % 4;
|
||||
size_t sse_copy_counter = (copy_size / 4) + remainder;
|
||||
auto * result_data_copy = result_data;
|
||||
|
||||
while (sse_copy_counter)
|
||||
{
|
||||
__m128i copy_batch = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_copy_begin_ptr));
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_copy), copy_batch);
|
||||
result_data_copy += 4;
|
||||
data_copy_begin_ptr += 4;
|
||||
--sse_copy_counter;
|
||||
}
|
||||
|
||||
result_data += copy_size;
|
||||
data_copy_begin_ptr = nullptr;
|
||||
}
|
||||
|
||||
if (span == 0)
|
||||
continue;
|
||||
|
||||
/// Copy single data element into result data
|
||||
|
||||
bool span_remainder = span % 4;
|
||||
size_t copy_counter = (span / 4) + span_remainder;
|
||||
auto * result_data_tmp = result_data;
|
||||
__m128i copy_element_data = _mm_set1_epi32(data[offset_index]);
|
||||
|
||||
while (copy_counter)
|
||||
{
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_tmp), copy_element_data);
|
||||
result_data_tmp += 4;
|
||||
--copy_counter;
|
||||
}
|
||||
|
||||
result_data += span;
|
||||
}
|
||||
|
||||
/// Copy tail if needed
|
||||
|
||||
if (data_copy_begin_ptr)
|
||||
{
|
||||
size_t copy_size = (data + offsets_size) - data_copy_begin_ptr;
|
||||
bool remainder = copy_size % 4;
|
||||
size_t sse_copy_counter = (copy_size / 4) + remainder;
|
||||
|
||||
while (sse_copy_counter)
|
||||
{
|
||||
__m128i copy_batch = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_copy_begin_ptr));
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(result_data), copy_batch);
|
||||
result_data += 4;
|
||||
data_copy_begin_ptr += 4;
|
||||
--sse_copy_counter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
|
||||
{
|
||||
@ -589,6 +698,14 @@ ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
|
||||
|
||||
auto res = this->create(offsets.back());
|
||||
|
||||
#ifdef __SSE2__
|
||||
if constexpr (std::is_same_v<T, UInt32>)
|
||||
{
|
||||
replicateSSE42Int32(getData().data(), res->getData().data(), offsets);
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
auto it = res->getData().begin(); // NOLINT
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user