diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 8a986c1ca86..7cfb90d4371 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -577,6 +577,113 @@ ColumnPtr ColumnVector::index(const IColumn & indexes, size_t limit) const return selectIndexImpl(*this, indexes, limit); } +#ifdef __SSE2__ + +namespace +{ + /** Optimization for ColumnVector replicate using SIMD instructions. + * For such optimization it is important that data is right padded with 15 bytes. + * + * Replicate span size is offsets[i] - offsets[i - 1]. + * + * Split spans into 3 categories. + * 1. Span with 0 size. Continue iteration. + * + * 2. Span with 1 size. Update pointer from which data must be copied into result. + * Then if we see span with size 1 or greater than 1 copy data directly into result data and reset pointer. + * Example: + * Data: 1 2 3 4 + * Offsets: 1 2 3 4 + * Result data: 1 2 3 4 + * + * 3. Span with size greater than 1. Save single data element into register and copy it into result data. + * Example: + * Data: 1 2 3 4 + * Offsets: 4 4 4 4 + * Result data: 1 1 1 1 + * + * Additional handling for tail is needed if pointer from which data must be copied from span with size 1 is not null. + */ + template + requires (std::is_same_v || std::is_same_v) + void replicateSSE42Int32(const IntType * __restrict data, IntType * __restrict result_data, const IColumn::Offsets & offsets) + { + const IntType * data_copy_begin_ptr = nullptr; + size_t offsets_size = offsets.size(); + + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) + { + size_t span = offsets[offset_index] - offsets[offset_index - 1]; + if (span == 1) + { + if (!data_copy_begin_ptr) + data_copy_begin_ptr = data + offset_index; + + continue; + } + + /// Copy data + + if (data_copy_begin_ptr) + { + size_t copy_size = (data + offset_index) - data_copy_begin_ptr; + bool remainder = copy_size % 4; + size_t sse_copy_counter = (copy_size / 4) + remainder; + auto * result_data_copy = result_data; + + while (sse_copy_counter) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_copy), *(reinterpret_cast(data_copy_begin_ptr))); + result_data_copy += 4; + data_copy_begin_ptr += 4; + --sse_copy_counter; + } + + result_data += copy_size; + data_copy_begin_ptr = nullptr; + } + + if (span == 0) + continue; + + /// Copy single data element into result data + + bool span_remainder = span % 4; + size_t copy_counter = (span / 4) + span_remainder; + auto * result_data_tmp = result_data; + __m128i copy_element_data = _mm_set1_epi32(data[offset_index]); + + while (copy_counter) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(result_data_tmp), copy_element_data); + result_data_tmp += 4; + --copy_counter; + } + + result_data += span; + } + + /// Copy tail if needed + + if (data_copy_begin_ptr) + { + size_t copy_size = (data + offsets_size) - data_copy_begin_ptr; + bool remainder = copy_size % 4; + size_t sse_copy_counter = (copy_size / 4) + remainder; + + while (sse_copy_counter) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(result_data), *(reinterpret_cast(data_copy_begin_ptr))); + result_data += 4; + data_copy_begin_ptr += 4; + --sse_copy_counter; + } + } + } +} + +#endif + template ColumnPtr ColumnVector::replicate(const IColumn::Offsets & offsets) const { @@ -587,13 +694,16 @@ ColumnPtr ColumnVector::replicate(const IColumn::Offsets & offsets) const if (0 == size) return this->create(); + auto res = this->create(offsets.back()); + #ifdef __SSE2__ if constexpr (std::is_same_v) - return replicateSSE2(offsets); + { + replicateSSE42Int32(getData().data(), res->getData().data(), offsets); + return res; + } #endif - auto res = this->create(offsets.back()); - auto it = res->getData().begin(); // NOLINT for (size_t i = 0; i < size; ++i) { @@ -605,110 +715,6 @@ ColumnPtr ColumnVector::replicate(const IColumn::Offsets & offsets) const return res; } - -#ifdef __SSE2__ - -template -ColumnPtr ColumnVector::replicateSSE2(const IColumn::Offsets & offsets) const -{ - auto res = this->create(offsets.back()); - - auto it = res->getData().begin(); // NOLINT - - /// Column is using PaddedPODArray, so we don't have to worry about the 4 out of range elements. - - IColumn::Offset prev_offset = 0; - std::optional copy_begin; - size_t size = offsets.size(); - for (size_t i = 0; i < size; ++i) - { - size_t span = offsets[i] - prev_offset; - prev_offset = offsets[i]; - if (span == 1) - { - if (!copy_begin) - copy_begin = i; - continue; - } - - /// data : 11 22 33 44 55 - /// offsets: 0 1 2 3 3 - /// res: 22 33 44 - if (copy_begin) - { - size_t copy_size = i - (*copy_begin); - bool remain = (copy_size & 3); - size_t sse_copy_counter = (copy_size >> 2); - sse_copy_counter = remain * (sse_copy_counter + 1) + (!remain) * (sse_copy_counter); - auto it_tmp = it; // NOLINT - size_t data_start = *copy_begin; - copy_begin.reset(); - constexpr const int copy_mask = _MM_SHUFFLE(3, 2, 1, 0); - while (sse_copy_counter) - { - __m128i data_to_copy = _mm_loadu_si128(reinterpret_cast(&data[data_start])); - auto copy_result = _mm_shuffle_epi32(data_to_copy, copy_mask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(it_tmp), copy_result); - it_tmp += 4; - data_start += 4; - --sse_copy_counter; - } - - it += copy_size; - } - - if (span == 0) - continue; - - /// data : 11 22 33 - /// offsets: 0 0 4 - /// res: 33 33 33 33 - size_t shuffle_size = span; - bool shuffle_remain = (shuffle_size & 3); - size_t sse_shuffle_counter = (shuffle_size >> 2); - sse_shuffle_counter = shuffle_remain * (sse_shuffle_counter + 1) + (!shuffle_remain) * (sse_shuffle_counter); - auto it_tmp = it; // NOLINT - constexpr const int shuffle_mask = (_MM_SHUFFLE(0, 0, 0, 0)); - __m128i data_to_shuffle = _mm_loadu_si128(reinterpret_cast(&data[i])); - auto shuffle_result = _mm_shuffle_epi32(data_to_shuffle, shuffle_mask); - while (sse_shuffle_counter) - { - _mm_storeu_si128(reinterpret_cast<__m128i *>(it_tmp), shuffle_result); - it_tmp += 4; - --sse_shuffle_counter; - } - it += shuffle_size; - } - - /// data : 11 22 33 44 55 - /// offsets: 1 2 3 4 5 - /// res: 11 22 33 44 55 - if (copy_begin) - { - size_t copy_size = (size - (*copy_begin)); - bool remain = (copy_size & 3); - size_t sse_copy_counter = (copy_size >> 2); - sse_copy_counter = remain * (sse_copy_counter + 1) + (!remain) * (sse_copy_counter); - auto it_tmp = it; // NOLINT - size_t data_start = *copy_begin; - constexpr const int copy_mask = (_MM_SHUFFLE(3, 2, 1, 0)); - while (sse_copy_counter) - { - __m128i data_to_copy = _mm_loadu_si128(reinterpret_cast(&data[data_start])); - auto copy_result = _mm_shuffle_epi32(data_to_copy, copy_mask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(it_tmp), copy_result); - it_tmp += 4; - data_start += 4; - --sse_copy_counter; - } - it += copy_size; - } - - return res; -} -#endif - - template void ColumnVector::gather(ColumnGathererStream & gatherer) { diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 6aae3a3e3fb..6ba9abaca32 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -136,9 +136,7 @@ private: /// Sugar constructor. ColumnVector(std::initializer_list il) : data{il} {} - #ifdef __SSE2__ - ColumnPtr replicateSSE2(const IColumn::Offsets & offsets) const; - #endif + public: bool isNumeric() const override { return is_arithmetic_v; }