mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-12 09:22:05 +00:00
ISSUES-1885 UTF8 countCodePoints use simd
This commit is contained in:
parent
c704f8b10c
commit
0f20952f2c
@ -3,6 +3,9 @@
|
||||
#include <Core/Types.h>
|
||||
#include <Common/BitHelpers.h>
|
||||
|
||||
#if __SSE2__
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -49,9 +52,37 @@ inline size_t seqLength(const UInt8 first_octet)
|
||||
/** Count the code points in a UTF-8 byte sequence.
  *
  * A code point is counted for every byte that is NOT a UTF-8 continuation byte
  * (continuation bytes have the form 10xxxxxx, i.e. 0x80..0xBF). The input is not
  * validated: malformed sequences are counted by the same rule.
  *
  * @param data  pointer to the first byte of the sequence
  * @param size  number of bytes available at `data`
  * @return      number of bytes in [data, data + size) that are not continuation bytes
  */
inline size_t countCodePoints(const UInt8 * data, size_t size)
{
    size_t res = 0;
    const auto end = data + size;

#if __SSE2__
    constexpr size_t bytes_sse = sizeof(__m128i);
    /// End of the part of the input that consists of whole 16-byte vectors.
    const auto src_end_sse = end - (size % bytes_sse);

    /// SSE2 only has signed byte comparisons. Interpreted as signed, the continuation
    /// bytes 0x80..0xBF are exactly -128..-65, so one signed "greater than 0xBF (-65)"
    /// comparison selects both 0x00..0x7F and 0xC0..0xFF.
    const auto threshold = _mm_set1_epi8(static_cast<char>(0xBF));
    const auto zero = _mm_setzero_si128();

    while (data < src_end_sse)
    {
        /// Sixteen independent 8-bit counters, one per lane.
        /// A batch is limited to 255 vectors so that no counter can overflow.
        auto lane_counts = zero;

        for (int i = 0; i < 0xFF && data < src_end_sse; ++i, data += bytes_sse)
        {
            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));

            /// The comparison yields 0xFF (-1) in every matching lane,
            /// so subtracting the mask increments the matching counters by one.
            lane_counts = _mm_sub_epi8(lane_counts, _mm_cmpgt_epi8(chars, threshold));
        }

        /// Sum of absolute differences against zero adds up the unsigned lanes:
        /// lanes 0..7 land in the low 64-bit half, lanes 8..15 in the high half
        /// (each sum is at most 8 * 255 and fits into 16 bits).
        const auto sums = _mm_sad_epu8(lane_counts, zero);
        res += static_cast<size_t>(_mm_cvtsi128_si32(sums));
        res += static_cast<size_t>(_mm_extract_epi16(sums, 4));
    }
#endif

    /// Scalar path: the tail after the last whole vector, or everything without SSE2.
    for (; data < end; ++data) /// Skip UTF-8 continuation bytes.
        res += (*data <= 0x7F || *data >= 0xC0);

    return res;
}
|
||||
|
Loading…
Reference in New Issue
Block a user