ISSUES-1885: use SIMD in UTF8 countCodePoints

This commit is contained in:
zhang2014 2018-02-10 17:21:54 +08:00
parent c704f8b10c
commit 0f20952f2c

View File

@ -3,6 +3,9 @@
#include <Core/Types.h>
#include <Common/BitHelpers.h>
#if __SSE2__
#include <emmintrin.h>
#endif
namespace DB
{
@ -49,9 +52,37 @@ inline size_t seqLength(const UInt8 first_octet)
inline size_t countCodePoints(const UInt8 * data, size_t size)
{
size_t res = 0;
const auto end = data + size;
/// TODO SIMD implementation looks quite simple.
for (auto end = data + size; data < end; ++data) /// Skip UTF-8 continuation bytes.
#if __SSE2__
const auto bytes_sse = sizeof(__m128i);
const auto src_end_sse = (data + size) - (size % bytes_sse);
const auto upper_bound = _mm_set1_epi8(0x7F + 1);
const auto lower_bound = _mm_set1_epi8(0xC0 - 1);
for (; data < src_end_sse;)
{
UInt8 mem_res[16] = {0};
auto sse_res = _mm_set1_epi8(0);
for (int i = 0; i < 0XFF && data < src_end_sse; ++i, data += bytes_sse)
{
const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));
sse_res = _mm_add_epi8(sse_res,
_mm_or_si128(_mm_cmplt_epi8(chars, upper_bound),
_mm_cmpgt_epi8(chars, lower_bound)));
}
_mm_store_si128(reinterpret_cast<__m128i *>(mem_res), sse_res);
for (auto count : mem_res)
res += count;
}
#endif
for (; data < end; ++data) /// Skip UTF-8 continuation bytes.
res += (*data <= 0x7F || *data >= 0xC0);
return res;