From 0f20952f2ce98329ed64c567484272905bcfc6d9 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sat, 10 Feb 2018 17:21:54 +0800 Subject: [PATCH] ISSUES-1885 UTF8 countCodePoints use simd --- dbms/src/Common/UTF8Helpers.h | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h index 1ce31426e85..ba1685c89f3 100644 --- a/dbms/src/Common/UTF8Helpers.h +++ b/dbms/src/Common/UTF8Helpers.h @@ -3,6 +3,9 @@ #include #include +#if __SSE2__ +#include +#endif namespace DB { @@ -49,9 +52,37 @@ inline size_t seqLength(const UInt8 first_octet) inline size_t countCodePoints(const UInt8 * data, size_t size) { size_t res = 0; + const auto end = data + size; - /// TODO SIMD implementation looks quite simple. - for (auto end = data + size; data < end; ++data) /// Skip UTF-8 continuation bytes. +#if __SSE2__ + const auto bytes_sse = sizeof(__m128i); + const auto src_end_sse = (data + size) - (size % bytes_sse); + + const auto upper_bound = _mm_set1_epi8(0x7F + 1); + const auto lower_bound = _mm_set1_epi8(0xC0 - 1); + + for (; data < src_end_sse;) + { + UInt8 mem_res[16] = {0}; + auto sse_res = _mm_set1_epi8(0); + + for (int i = 0; i < 0XFF && data < src_end_sse; ++i, data += bytes_sse) + { + const auto chars = _mm_loadu_si128(reinterpret_cast(data)); + sse_res = _mm_add_epi8(sse_res, + _mm_or_si128(_mm_cmplt_epi8(chars, upper_bound), + _mm_cmpgt_epi8(chars, lower_bound))); + } + + _mm_store_si128(reinterpret_cast<__m128i *>(mem_res), sse_res); + + for (auto count : mem_res) + res += count; + } + +#endif + + for (; data < end; ++data) /// Skip UTF-8 continuation bytes. res += (*data <= 0x7F || *data >= 0xC0); return res;